# Assignment 1: RISC-V Assembly and Instruction Pipeline contributed by < [kuohanwen](https://github.com/kuohanwen/ca2025-quizzes) > [Homework1 Requirment](https://hackmd.io/@sysprog/2025-arch-homework1) ## Problem B Refer to [Quiz1 of Computer Architecture (2025 Fall) Problem B](https://hackmd.io/@sysprog/arch2025-quiz1-sol) ### Description of Problem Use an 8-bit `uf8` format (4-bit exponent + 4-bit mantissa) to approximately represent 20-bit unsigned integers, satisfying the relative error is **≤ 1/16**. --- ### C Code of Decoding (8bits → 32bits) ```c= // Decode an 8-bit uf8 code [eeee mmmm] into a 32-bit unsigned integer. // The decoded value = (m << e) + 16 * (2^e − 1) uint32_t uf8_decode(uf8 fl) { uint32_t mantissa = fl & 0x0f; // Extract the 4-bit mantissa (bits [3:0]). uint8_t exponent = fl >> 4; // Extract the 4-bit exponent (bits [7:4]). uint32_t offset = (0x7FFF >> (15 - exponent)) << 4; // Compute offset(e) = 16 * (2^15 − 1)* (2^-(15-e)) ~= 16 * (2^e − 1) return (mantissa << exponent) + offset; // Final decoded integer: mantissa * 2^e + offset(e) } ``` Decoding Method $$ D(b) = m \cdot 2^e + (2^e - 1) \cdot 16 $$ Input:8-bit uf8 value b Output:20-bit unsigned integer D(b) ### C Code of Encoding (32bits → 8bits) ```c= // Encode a non-negative 32-bit integer into an 8-bit uf8 code [eeee mmmm] uf8 uf8_encode(uint32_t value) { // Small values (0..15) are exact in uf8 (e=0) can return directly if (value < 16) return value; /* Fast exponent e hint using CLZ: lz = leading zeros in value ; msb = 31 - lz~= floor(log2(value)) */ int lz = clz(value); int msb = 31 - lz; /* Give a initial guess overflow will hold offset(e) = 16*(2^e - 1). */ uint8_t exponent = 0; uint32_t overflow = 0; /* for values ≥ 32 the number is large enough that using the log₂-based approximation becomes worthwhile 16~31 start with exponent = 0 and overflow = 0 */ if (msb >= 5) { // Empirical initial guess: e ≈ msb - 4 exponent = msb - 4; // e=0~15 if (exponent > 15) exponent = 15; /* Compute offset(e) iteratively: 16*(2^e - 1)=16*(2^(e - 1) - 1) * 2 + 16 offset(e) = offset(e-1)*2 + 16*/ for (uint8_t e = 0; e < exponent; e++) overflow = (overflow << 1) + 16; /* If the estimate overshot (value < offset(e)), step e down while undoing the recurrence: offset(e-1) = (offset(e) - 16)/2 result will be (value > offset(e))*/ while (exponent > 0 && value < overflow) { overflow = (overflow - 16) >> 1; exponent--; } } /* Move e upward until value < offset(e+1) solve the condition that value >... >offset(e+1)> offset(e) 16~31 will be revised to exponent = 1 and overflow = 16 Then result will be offset(e) ≤ value < offset(e+1)*/ while (exponent < 15) { uint32_t next_overflow = (overflow << 1) + 16; if (value < next_overflow) break; overflow = next_overflow; exponent++; } // m = floor((value - offset(e)) / 2^e) uint8_t mantissa = (value - overflow) >> exponent; // b = [eeee mmmm] return (exponent << 4) | mantissa; } ``` Encoding Method $$ E(v) = \begin{cases} v, & \text{if } v < 16 \\ 16e + \lfloor(v - \text{offset}(e))/2^e\rfloor, & \text{otherwise} \end{cases} $$ where $$\text{offset}(e) = (2^e - 1) \cdot 16$$ Input:20-bit unsigned integer v Output:8-bit uf8 value E(v) ### Full RISC-V Assembly Code ```c= .data # offset(e) = 16*(2^e - 1), e=0..15 offset_tbl: .word 0,16,48,112,240,496,1008,2032 .word 4080,8176,16368,32752,65520,131056,262128,524272 msg1: .asciz ": decodes to " msg2: .asciz " but re-encodes as " msg3: .asciz ": decoded is " msg4: .asciz " <= previous value " msg5: .asciz "All tests passed.\n" msg6: .asciz "At least one test failed.\n" newline: .asciz "\n" .align 2 .text .globl main main: jal ra, test # start test beq a0, x0, Not_pass # failed la a0, msg5 # print msg5 when passing li a7, 4 ecall li a7, 10 # ecall: exit li a0, 0 # exit code 0 (success) ecall Not_pass: la a0, msg6 # print msg6 when not passing li a7, 4 ecall li a7, 10 # ecall: exit li a0, 1 # exit code 1 (failure) ecall test: addi sp, sp, -32 sw ra, 28(sp) sw s0, 24(sp) sw s1, 20(sp) sw s2, 16(sp) sw s3, 12(sp) sw s4, 8(sp) sw s5, 4(sp) addi s0, x0, -1 # previous_value li s1, 1 # passed = true li s2, 0 # fl (0..255) li s3, 256 # loop end For_2: add a0, s2, x0 # a0 = fl jal ra, uf8_decode add s4, a0, x0 # value add a0, s4, x0 # a0 = value jal ra, uf8_encode add s5, a0, x0 # fl2 # (A) round-trip check test_if_1: beq s2, s5, test_if_2 add a0, s2, x0 # print fl (hex) li a7, 34 ecall la a0, msg1 li a7, 4 ecall add a0, s4, x0 # print value (decimal) li a7, 1 ecall la a0, msg2 li a7, 4 ecall add a0, s5, x0 # print fl2 (hex) li a7, 34 ecall la a0, newline li a7, 4 ecall li s1, 0 # passed = false # (B) strict monotonic increase test_if_2: blt s0, s4, after_if add a0, s2, x0 li a7, 34 ecall la a0, msg3 li a7, 4 ecall add a0, s4, x0 li a7, 1 ecall la a0, msg4 li a7, 4 ecall add a0, s0, x0 li a7, 34 ecall la a0, newline li a7, 4 ecall li s1, 0 after_if: add s0, s4, x0 addi s2, s2, 1 blt s2, s3, For_2 add a0, s1, x0 # return passed (1/0) lw ra, 28(sp) lw s0, 24(sp) lw s1, 20(sp) lw s2, 16(sp) lw s3, 12(sp) lw s4, 8(sp) lw s5, 4(sp) addi sp, sp, 32 jr ra #CLZ (fixed 16/8/4/2/1; clz(0)=32) CLZ: beq a0, x0, CLZ_ZERO li t0, 0 # bitlen-1 accumulator srli t1, a0, 16 beq t1, x0, CLZ_1 addi t0, t0, 16 add a0, t1, x0 CLZ_1: srli t1, a0, 8 beq t1, x0, CLZ_2 addi t0, t0, 8 add a0, t1, x0 CLZ_2: srli t1, a0, 4 beq t1, x0, CLZ_3 addi t0, t0, 4 add a0, t1, x0 CLZ_3: srli t1, a0, 2 beq t1, x0, CLZ_4 addi t0, t0, 2 add a0, t1, x0 CLZ_4: srli t1, a0, 1 beq t1, x0, CLZ_5 addi t0, t0, 1 CLZ_5: li t1, 31 sub a0, t1, t0 # clz = 31 - (bitlen-1) jr ra CLZ_ZERO: li a0, 32 jr ra # uf8_decode # a0 = uf8 code -> a0 = decoded integer uf8_decode: andi t0, a0, 0x0F # m srli t1, a0, 4 # e la t2, offset_tbl slli t3, t1, 2 # index = e * 4 (word offset) add t2, t2, t3 lw t2, 0(t2) # t2 = offset(e) sll t0, t0, t1 # m << e add a0, t0, t2 # value = offset + (m << e) jr ra # uf8_encode # a0 = value -> a0 = uf8 code uf8_encode: addi sp, sp, -4 sw ra, 0(sp) add t6, a0, x0 # t6 = value # Small values: exact (e=0) li t0, 16 blt t6, t0, UF8ENC_RET # a0 already holds value # e = floor_log2( (value + 16) >> 4 ) = 31 - clz(t) addi t1, t6, 16 srli t1, t1, 4 add a0, t1, x0 jal ra, CLZ li t2, 31 sub t2, t2, a0 # t2 = e # clamp e to 15 li t0, 15 blt t2, t0, 1f li t2, 15 1: li t3, 1 # t3 = pow2 add t1, t2, x0 # t1 = e (loop counter) 2: beq t1, x0, 3f slli t3, t3, 1 # pow2 <<= 1 addi t1, t1, -1 j 2b 3: # off = ((1 << e) - 1) << 4 addi t3, t3, -1 slli t3, t3, 4 # t3 = off # m = (value - off) >> e (single shift; then saturate) sub t4, t6, t3 srl t4, t4, t2 li t0, 15 blt t4, t0, 4f li t4, 15 4: slli t5, t2, 4 or a0, t5, t4 # a0 = (e<<4) | m UF8ENC_RET: lw ra, 0(sp) addi sp, sp, 4 jr ra ``` ### Explain of RISC-V Assembly Code ##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">main</span> main is the program entry that orchestrates the run: it jumps to test, which returns a pass/fail flag in a0 (1 = pass, 0 = fail). After the call, main inspects a0; if it is non-zero, it prints msg5 (“All tests passed.\n”) using the RARS/Ripes print-string syscall (place the string address in a0, set a7=4, then ecall) and terminates with the exit syscall (a7=10) using exit code 0. If a0 is zero, it prints msg6 (“At least one test failed.\n”) in the same way and exits with code 1. Throughout, a0 serves as both the test result and the syscall argument register, per the calling convention. ##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">Test</span> test verifies the uf8 codec end-to-end over all 256 byte codes. It first creates a 32-byte stack frame and saves ra and the callee-saved registers it will use. It then initializes its state: s0 = -1 (previous decoded value), s1 = 1 (passed flag), s2 = 0 (current uf8 code, 0…255), and s3 = 256 (loop limit). For each fl = s2 it calls uf8_decode to get value (s4), then calls uf8_encode(value) to get fl2 (s5). Two checks are performed: (A)<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Round-trip</span> if fl2 != fl, it prints a diagnostic line showing fl (hex), the decoded value (decimal), and the re-encoded fl2 (hex), and clears s1 to 0. (B)<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Strict monotonicity</span> if value <= previous_value (s0), it prints a diagnostic line fl: value <=' previous_value (with fl and previous_value in hex) and clears s1. After the checks it updates previous_value = value, increments fl, and repeats until fl == 256. At exit it returns the pass flag in a0 (a0 = s1, 1 = all tests passed, 0 = at least one failure), restores ra and s0..s5, pops the stack frame, and jr ra. ##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">CLZ</span>— Count Leading Zeros 1.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Purpose</span> Returns the number of consecutive zero bits before the most-significant 1 in a 32-bit unsigned integer. By convention clz(0) = 32. The routine is used by uf8_encode to compute 2.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Inputs / Outputs</span> Input: a0 — 32-bit unsigned value. Output: a0 — integer in [0,32], the count of leading zeros. Registers: t0 (accumulator for “bitlen−1”), t1 (temporary shifts). ra is not modified (leaf function). 3.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Algorithm</span> (1)If a0 == 0, return 32 immediately. (2)Initialize t0 = 0 (will accumulate the index of the highest set bit). (3)Probe blocks of bits from MSB to LSB with fixed right shifts: Test upper 16 bits: t1 = a0 >> 16. If non-zero, the MSB lies in that half → add 16 to t0, replace a0 = t1. Else, keep a0 as is (MSB lies in the lower half). Repeat with shifts 8, 4, 2, 1 using the same rule, each time adding the block size to t0 if the probed part is non-zero and narrowing a0 to that sub-range. (4)At the end, t0 equals (bitlen − 1), so return 31 - t0 in a0. 4.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Benefit</span> (1)RV32I-only: uses srli, beq, addi, add, li, sub, jr—no M/F/D extensions. (2)Constant time: always performs at most 5 probes; no data-dependent loops → predictable timing. (3)Efficient: far fewer branches than naive 32-iteration scans, and avoids multiplication/division. (4)Time Complexity:O(1) — exactly five conditional tests after the zero check. 5.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Edge cases & assumptions</span> (1)Input is treated as unsigned; if callers pass a signed value, it is interpreted bitwise (two’s complement). (2)clz(0) is explicitly defined as 32 to avoid a special “no MSB” case. (3)Because it’s a leaf routine, it does not touch the stack or ra. 6.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Example</span> a0 = 0x0010_0000 a0 != 0 → do not return 32. Initialize t0 = 0 (will accumulate the index of the highest set bit). Probe from MSB to LSB with fixed right shifts: Shift 16: t1 = a0 >> 16 = 0x0000_0010 (non-zero). ⇒ MSB is in this half → t0 += 16 → t0 = 16; set a0 = t1 = 0x10. Shift 8: t1 = a0 >> 8 = 0x0000_0000 (zero). ⇒ Stay in lower half → keep a0 = 0x10; t0 unchanged (16). Shift 4: t1 = a0 >> 4 = 0x0000_0001 (non-zero). ⇒ Add 4 → t0 = 20; set a0 = t1 = 0x1. Shift 2: t1 = a0 >> 2 = 0x0000_0000 (zero). ⇒ Keep a0 = 0x1; t0 unchanged (20). Shift 1: t1 = a0 >> 1 = 0x0000_0000 (zero). ⇒ Keep a0 = 0x1; t0 unchanged (20). Finish: t0 = 20 (bitlen − 1), so return a0 = 31 − t0 = 31 − 20 = 11. Result: clz(0x0010_0000) = 11 (there are 11 leading zeros in the 32-bit representation). ##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">uf8_decode</span> 1.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Purpose</span> Convert an 8-bit uf8 code [eeee|mmmm] (4-bit exponent e, 4-bit mantissa m) into its 32-bit unsigned representative. 2.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">How it works</span> Extract e = b >> 4 and m = b & 0x0F. Load the interval start from a lookup table: off = offset_tbl[e] where offset_tbl[e] = 16*(2^e − 1). Return offset[e] + (m << e); i.e., $$ D(b) = m \cdot 2^e + (2^e - 1) \cdot 16 $$ 3.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Input / Output</span> Input: b (0–255) — an 8-bit uf8 code. Output: 32-bit unsigned integer D(b). 4.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Why the lookup table helps</span> (1)Fixed and tiny cost (O(1)) offset(e) has only 16 entries (e = 0..15), so the table is 16 × 4B = 64 bytes. One indexed load + one add + one shift finishes the job. Latency is stable with no data-dependent loops. (2)Stable performance Compared with recomputing offset(e) as ((1<<e) - 1) << 4 or (0x7FFF >> (15 - e)) << 4, the lookup avoids extra “variable shifts + sub/bit-twiddling” sequences. On embedded/pipelined cores this means fewer instructions, shorter dependency chains, and tighter timing dispersion—great for real-time systems. (3)Avoids overflow and boundary pitfalls Computing ((1<<e) - 1) << 4 at runtime can hit shift-width limits or immediate encoding quirks when porting to different word sizes or variants. With a lookup, all boundaries are pre-fixed in the table, so you don’t encounter those runtime edge cases. 5.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Example</span> b = 0x1A → e=1, m=10, offset(1)=16, m<<e=20 → result 36. ##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">uf8_encode</span> 1.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Objective</span> Encode a 20-bit unsigned integer value into an 8-bit uf8 code [eeee|mmmm] 2.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">How it works</span> (1)Small values passthrough: if value < 16, return value exactly (e=0). (2)Find the exponent: compute t = (value + 16) >> 4, then e = 31 − clz(t); clamp e to 15. (3)Build the interval start: construct pow2 = (1 << e) via a tiny loop, then off = (pow2 − 1) << 4. (4)Compute the mantissa: m = (value − off) >> e, saturate to 15 if needed. (5)Pack and return: (e << 4) | m in a0. 3.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Benefit</span> (1)Monotonic and round-trip consistent: codes increase strictly with value; for any code b, encode(decode(b)) == b (2)predictable error: 0..15 are exact; beyond that the step is 2^e, giving a worst-case relative error ≤ 1/16 (6.25%). (3)Robust with saturation: clamping e and m to 15 ensures every input maps to a valid 8-bit code,there is no undefined behavior. 4.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Example</span> Input: value = 36 Compute t = (value + 16) >> 4 = (36 + 16) >> 4 = 52 >> 4 = 3. Exponent via CLZ: clz(3) = 30 ⇒ e = 31 − 30 = 1 (clamp not needed). Build pow2 = (1 << e) via the small loop: start at 1, shift once ⇒ pow2 = 2. Interval start: off = (pow2 − 1) << 4 = (2 − 1) << 4 = 1 << 4 = 16. Mantissa: m = (value − off) >> e = (36 − 16) >> 1 = 20 >> 1 = 10. Pack: uf8 = (e << 4) | m = (1 << 4) | 10 = 0x10 | 0x0A = 0x1A. ### Experiment and Result ##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">Improved Approach</span> Cycle count=35402 CPI=1.46 ![B_Result](https://hackmd.io/_uploads/B1U4iVcTgg.png) ![Question_B](https://hackmd.io/_uploads/Sk2_7r5axl.png) ##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">Original Approach</span> Cycle count=41802 CPI=1.49 ![B_result2](https://hackmd.io/_uploads/S1td4rqTeg.png) ![questionB_result](https://hackmd.io/_uploads/H16WBS9Txl.png) ### Improvement By using `lookup table approach`,cycles reduced about (41802-35402)/41802=15% and instructions reduced about (28062-24222)/28062=13% ## Problem C Refer to [Quiz1 of Computer Architecture (2025 Fall) Problem C](https://hackmd.io/@sysprog/arch2025-quiz1-sol) ### Description of Problem Implement addition, subtraction, multiplication, division and square root bfloat16 format ``` ┌─────────┬──────────────┬──────────────┐ │Sign (1) │ Exponent (8) │ Mantissa (7) │ └─────────┴──────────────┴──────────────┘ 15 14 7 6 0 S: Sign bit (0 = positive, 1 = negative) E: Exponent bits (8 bits, bias = 127) M: Mantissa/fraction bits (7 bits) ``` For regular numbers that are not special cases (like zero, infinity, NaN or Denormals), their value is given by the formula: $$ v = (-1)^S \times 2^{E-127} \times \left(1 + \frac{M}{128}\right) $$ bfloat16 preserves the wide numeric range of float32 by keeping the same 8-bit exponent, but it reduces precision because its mantissa has only 7 bits instead of 23. ### Special Cases :::spoiler Assembly Code ```c= .text .globl bf16_isnan bf16_isnan: # a0: bf16 bits, return 1 if NaN, else 0 li t0, 0x7FFF # clear sign and t1, a0, t0 # t1 = |x| li t2, 0x7F80 # |Inf| sltu a0, t2, t1 # a0 = (|x| > |Inf|) ret .globl bf16_isinf bf16_isinf: # a0: bf16 bits, return 1 if ±Inf, else 0 li t0, 0x7FFF and t1, a0, t0 # t1 = |x| li t2, 0x7F80 # |Inf| xor t1, t1, t2 seqz a0, t1 # a0 = 1 if equal ret .globl bf16_iszero bf16_iszero: # a0: bf16 bits, return 1 if ±0, else 0 li t0, 0x7FFF and t1, a0, t0 # drop sign seqz a0, t1 ret .globl f32_to_bf16 f32_to_bf16: # a0: f32 bits, return bf16 bits in a0 srli t0, a0, 23 andi t0, t0, 0xFF # exponent addi t0, t0, -255 # exponent - 0xFF bnez t0, f32_unspecial # exponent == 0xFF: Inf or NaN, just truncate srli a0, a0, 16 ret f32_unspecial: # round to nearest even when truncating to bf16 srli t0, a0, 16 andi t0, t0, 1 # bit16 (LSB of discarded part) li t1, 0x7FFF add t0, t0, t1 # 0x7FFF or 0x8000 add a0, a0, t0 srli a0, a0, 16 ret .globl bf16_to_f32 bf16_to_f32: # a0: bf16 bits, return f32 bits slli a0, a0, 16 ret .globl BF16_NAN BF16_NAN: # return quiet NaN (bf16) li a0, 0x7FC0 ret .globl BF16_ZERO BF16_ZERO: # return +0 (bf16) li a0, 0x0000 ret ``` ::: ### Implementation Addition #### Test Case | Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C | |--------|-----------|---------|---------|--------|--------|--------|--------| | 1 | Add | 1.0 | 2.0 | 0x3F80 | 0x4000 | 3.0 | 0x4040 | | 2 | Add | -1.5 | 0.5 | 0xBFC0 | 0x3F00 | -1.0 | 0xBF80 | | 3 | Add | -Inf | 5.0 | 0xFF80 | 0x40A0 | -Inf | 0xFF80 | :::spoiler Assembly Code ```c= .data newline: .string "\n" pass_msg: .asciz "Test Passed\n" fail_msg: .asciz "Test Failed\n" .text .globl main main: addi sp, sp, -4 sw ra, 0(sp) test1: # Test 1.0 + 2.0 = 3.0 li a0, 0x3F80 # Input A = 1.0 li a1, 0x4000 # Input B = 2.0 jal ra, bf16_add # Output C = A + B li t1, 0x4040 # Expected Output C = 3.0 bne a0, t1, test1_fail jal ra, print_pass j test2 test1_fail: jal ra, print_fail j test2 test2: # Test -1.5 + 0.5 = -1.0 li a0, 0xBFC0 # Input A = -1.5 li a1, 0x3F00 # Input B = 0.5 jal ra, bf16_add # Output C = A + B li t1, 0xBF80 # Expected Output C = -1.0 bne a0, t1, test2_fail jal ra, print_pass j test3 test2_fail: jal ra, print_fail j test3 test3: # Test -Inf + 5.0 = -Inf li a0, 0xFF80 # Input A = -Inf li a1, 0x40A0 # Input B = 5.0 jal ra, bf16_add # Output C = A + B li t1, 0xFF80 # Expected Output C = -Inf bne a0, t1, test3_fail jal ra, print_pass j tests_done test3_fail: jal ra, print_fail j tests_done print_pass: la a0, pass_msg li a7, 4 # syscall: print string ecall jr ra print_fail: la a0, fail_msg li a7, 4 # syscall: print string ecall jr ra tests_done: lw ra, 0(sp) addi sp, sp, 4 li a7, 10 # syscall: exit ecall .globl bf16_add bf16_add: # extract sign, exponent, mantissa srli t0, a0, 15 # t0 = sign_a (bit 15) srli t1, a1, 15 # t1 = sign_b srli t2, a0, 7 andi t2, t2, 0xFF # t2 = exp_a (8 bits) srli t3, a1, 7 andi t3, t3, 0xFF # t3 = exp_b andi t4, a0, 0x7F # t4 = mant_a (7 bits) andi t5, a1, 0x7F # t5 = mant_b (7 bits) li t6, 0xFF bne t2, t6, check_exp_b exp_a_checkall: bnez t4, ret_a # mant_a != 0 → a is NaN, return a bne t3, t6, ret_a # a is Inf, b is finite → return a bnez t5, return_b1 # b mantissa != 0 → b is NaN bne t0, t1, return_nan # +Inf + -Inf → NaN return_b1: mv a0, a1 # b is NaN or same-sign Inf ret return_nan: li a0, 0x7FC0 # canonical NaN ret_a: ret check_exp_b: beq t3, t6, return_b2 j check_0_a return_b2: mv a0, a1 # b is NaN or Inf ret check_0_a: bnez t2, check_0_b # exp_a != 0 → not zero bnez t4, check_0_b # mant_a != 0 → not zero mv a0, a1 # a is ±0 → result = b ret check_0_b: bnez t3, norm_a bnez t5, norm_a ret # b is ±0 → result = a (a0) norm_a: beqz t2, norm_b # exp_a == 0 → subnormal ori t4, t4, 0x80 # mant_a |= 1 << 7 (restore hidden bit) norm_b: beqz t3, end_check1 ori t5, t5, 0x80 # mant_b |= 1 << 7 (restore hidden bit) end_check1: addi sp, sp, -20 sw s0, 16(sp) sw s1, 12(sp) sw s2, 8(sp) sw s3, 4(sp) sw s4, 0(sp) sub s0, t2, t3 # s0 = exp_diff = exp_a - exp_b blez s0, diff_neg # exp_a <= exp_b mv s2, t2 # result_exp = exp_a li t6, 8 bgt s0, t6, return_a # if exp_diff > 8 → B too small srl t5, t5, s0 # shift mant_b j exp_done diff_neg: bgez s0, diff_else # exp_diff == 0 mv s2, t3 # result_exp = exp_b li t6, -8 bge s0, t6, shift_a # if exp_diff >= -8 → shift A j return_b3 # else A too small → result ≈ B shift_a: neg s4, s0 # s4 = -exp_diff srl t4, t4, s4 # shift mant_a j exp_done diff_else: # exp_diff == 0 mv s2, t2 j exp_done return_a: # a0 is already A → return A directly j bf16_epilogue return_b3: # result = B mv a0, a1 j bf16_epilogue exp_done: bne t0, t1, diff_sign # sign differ → subtraction same_sign: mv s1, t0 # result_sign add s3, t4, t5 # result_mant = mant_a + mant_b andi t6, s3, 0x100 # overflow into bit 8? beqz t6, norm_end srli s3, s3, 1 # shift mantissa right addi s2, s2, 1 # exponent++ li t6, 0xFF bge s2, t6, overflow_inf j norm_end overflow_inf: # a0 = ±Inf slli a0, s1, 15 # sign li t6, 0x7F80 # Inf exponent (exp=0xFF, mant=0) or a0, a0, t6 j bf16_epilogue diff_sign: bge t4, t5, manta_gt_b mv s1, t1 # result_sign = sign_b sub s3, t5, t4 # mant_b - mant_a j mant_result manta_gt_b: mv s1, t0 # result_sign = sign_a sub s3, t4, t5 # mant_a - mant_b mant_result: beqz s3, return_zero # exact zero norm_loop: andi t6, s3, 0x80 bnez t6, norm_end slli s3, s3, 1 addi s2, s2, -1 blez s2, return_zero j norm_loop norm_end: # reconstruct BF16: sign | exponent | mantissa slli a0, s1, 15 # sign andi t0, s2, 0xFF slli t0, t0, 7 # exponent or a0, a0, t0 andi t0, s3, 0x7F # mantissa or a0, a0, t0 j bf16_epilogue return_zero: li a0, 0x0000 # +0 bf16_epilogue: lw s0, 16(sp) lw s1, 12(sp) lw s2, 8(sp) lw s3, 4(sp) lw s4, 0(sp) addi sp, sp, 20 ret ``` ::: #### Result ![螢幕擷取畫面 2025-12-08 194941](https://hackmd.io/_uploads/BkOdo44zbl.png) ### Implementation Subtraction #### Test Case | Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C | |--------|-----------|---------|---------|--------|--------|--------|--------| | 1 | Sub | 3.0 | 1.0 | 0x4040 | 0x3F80 | 2.0 | 0x4000 | | 2 | Sub | 1.5 | 2.0 | 0x3FC0 | 0x4000 | -0.5 | 0xBF00 | | 3 | Sub | +Inf | 2.5 | 0x7F80 | 0x4020 | +Inf | 0x7F80 | :::spoiler Assembly Code ```c= .data newline: .string "\n" pass_msg: .asciz "Test Passed\n" fail_msg: .asciz "Test Failed\n" .text .globl main main: addi sp, sp, -4 sw ra, 0(sp) test1: # Test 3.0 - 1.0 = 2.0 li a0, 0x4040 # Input A = 3.0 li a1, 0x3F80 # Input B = 1.0 jal ra, bf16_sub # Output C = A - B li t1, 0x4000 # Expected Output C = 2.0 bne a0, t1, test1_fail jal ra, print_pass j test2 test1_fail: jal ra, print_fail j test2 test2: # Test 1.5 - 2.0 = -0.5 li a0, 0x3FC0 # Input A = 1.5 li a1, 0x4000 # Input B = 2.0 jal ra, bf16_sub # Output C = A - B li t1, 0xBF00 # Expected Output C = -0.5 bne a0, t1, test2_fail jal ra, print_pass j test3 test2_fail: jal ra, print_fail j test3 test3: # Test +Inf - 2.5 = +Inf li a0, 0x7F80 # Input A = +Inf li a1, 0x4020 # Input B = 2.5 jal ra, bf16_sub # Output C = A - B li t1, 0x7F80 # Expected Output C = +Inf bne a0, t1, test3_fail jal ra, print_pass j tests_done test3_fail: jal ra, print_fail j tests_done print_pass: la a0, pass_msg li a7, 4 # syscall: print string ecall jr ra print_fail: la a0, fail_msg li a7, 4 # syscall: print string ecall jr ra tests_done: lw ra, 0(sp) addi sp, sp, 4 li a7, 10 # syscall: exit ecall # BF16 subtraction wrapper: C = A - B # Implemented as: A + (-B), reusing bf16_add .globl bf16_sub bf16_sub: li t6, 0x8000 # mask for sign bit (bit 15) xor a1, a1, t6 # flip sign of B → B = -B j bf16_add # tail call bf16_add (A + (-B)) # Original BF16 addition: C = A + B .globl bf16_add bf16_add: # extract sign, exponent, mantissa srli t0, a0, 15 # t0 = sign_a (bit 15) srli t1, a1, 15 # t1 = sign_b srli t2, a0, 7 andi t2, t2, 0xFF # t2 = exp_a (8 bits) srli t3, a1, 7 andi t3, t3, 0xFF # t3 = exp_b andi t4, a0, 0x7F # t4 = mant_a (7 bits) andi t5, a1, 0x7F # t5 = mant_b (7 bits) li t6, 0xFF bne t2, t6, check_exp_b exp_a_checkall: bnez t4, ret_a # mant_a != 0 → a is NaN, return a bne t3, t6, ret_a # a is Inf, b is finite → return a bnez t5, return_b1 # b mantissa != 0 → b is NaN bne t0, t1, return_nan # +Inf + -Inf → NaN return_b1: mv a0, a1 # b is NaN or same-sign Inf ret return_nan: li a0, 0x7FC0 # canonical NaN ret_a: ret check_exp_b: beq t3, t6, return_b2 j check_0_a return_b2: mv a0, a1 # b is NaN or Inf ret check_0_a: bnez t2, check_0_b # exp_a != 0 → not zero bnez t4, check_0_b # mant_a != 0 → not zero mv a0, a1 # a is ±0 → result = b ret check_0_b: bnez t3, norm_a bnez t5, norm_a ret # b is ±0 → result = a (a0) norm_a: beqz t2, norm_b # exp_a == 0 → subnormal ori t4, t4, 0x80 # mant_a |= 1 << 7 norm_b: beqz t3, end_check1 ori t5, t5, 0x80 end_check1: addi sp, sp, -20 sw s0, 16(sp) sw s1, 12(sp) sw s2, 8(sp) sw s3, 4(sp) sw s4, 0(sp) sub s0, t2, t3 # s0 = exp_diff = exp_a - exp_b blez s0, diff_neg # exp_a <= exp_b mv s2, t2 # result_exp = exp_a li t6, 8 bgt s0, t6, return_a # if exp_diff > 8 → B too small srl t5, t5, s0 # shift mant_b j exp_done diff_neg: bgez s0, diff_else # exp_diff == 0 mv s2, t3 # result_exp = exp_b li t6, -8 bge s0, t6, shift_a # if exp_diff >= -8 → shift A j return_b3 # else A too small → result ≈ B shift_a: neg s4, s0 # s4 = -exp_diff srl t4, t4, s4 # shift mant_a j exp_done diff_else: # exp_diff == 0 mv s2, t2 j exp_done return_a: # a0 is already A → return A directly j bf16_epilogue return_b3: # result = B mv a0, a1 j bf16_epilogue exp_done: bne t0, t1, diff_sign # sign differ → subtraction same_sign: mv s1, t0 # result_sign add s3, t4, t5 # result_mant andi t6, s3, 0x100 # overflow into bit 8? beqz t6, norm_end srli s3, s3, 1 # shift mantissa addi s2, s2, 1 # exponent++ li t6, 0xFF bge s2, t6, overflow_inf j norm_end overflow_inf: # a0 = ±Inf slli a0, s1, 15 # sign li t6, 0x7F80 # Inf exponent (exp=0xFF, mant=0) or a0, a0, t6 j bf16_epilogue diff_sign: bge t4, t5, manta_gt_b mv s1, t1 # result_sign = sign_b sub s3, t5, t4 # mant_b - mant_a j mant_result manta_gt_b: mv s1, t0 # result_sign = sign_a sub s3, t4, t5 # mant_a - mant_b mant_result: beqz s3, return_zero # exact zero norm_loop: andi t6, s3, 0x80 bnez t6, norm_end slli s3, s3, 1 addi s2, s2, -1 blez s2, return_zero j norm_loop norm_end: # reconstruct BF16: sign | exponent | mantissa slli a0, s1, 15 # sign andi t0, s2, 0xFF slli t0, t0, 7 # exponent or a0, a0, t0 andi t0, s3, 0x7F # mantissa or a0, a0, t0 j bf16_epilogue return_zero: li a0, 0x0000 # +0 bf16_epilogue: lw s0, 16(sp) lw s1, 12(sp) lw s2, 8(sp) lw s3, 4(sp) lw s4, 0(sp) addi sp, sp, 20 ret ``` ::: #### Result ![螢幕擷取畫面 2025-12-08 195706](https://hackmd.io/_uploads/rJoXTNVGbg.png) ### Implementation Multiplication #### Test Case | Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C | |--------|-----------|---------|---------|--------|--------|--------|--------| | 1 | Mul | 3.0 | 2.5 | 0x4040 | 0x4020 | 7.5 | 0x40F0 | | 2 | Mul | 0.0 | +Inf | 0x0000 | 0x7F80 | NAN | 0x7FC0 | | 3 | Mul | 2.0 | -1.5 | 0x4000 | 0xBFC0 | -3.0 | 0xC040 | :::spoiler Assembly Code ```c= .globl main .data pass_msg: .asciz "PASS\n" fail_msg: .asciz "FAIL\n" .text main: addi sp, sp, -4 sw ra, 0(sp) # Test 1: 3.0 × 2.5 = 7.5 test1: li a0, 0x4040 # Input A = 3.0 li a1, 0x4020 # Input B = 2.5 jal ra, bf16_mul # Output C = A * B li t1, 0x40F0 # Expected Output C = 7.5 bne a0, t1, fail1 jal ra, print_pass j test2 fail1: jal ra, print_fail # Test 2: 0.0 × Inf = NaN test2: li a0, 0x0000 # Input A = 0.0 li a1, 0x7F80 # Input B = +Inf jal ra, bf16_mul # Output C = A * B li t1, 0x7FC0 # Expected Output C = NAN bne a0, t1, fail2 jal ra, print_pass j test3 fail2: jal ra, print_fail # Test 3: 2.0 × -1.5 = -3.0 test3: li a0, 0x4000 # Input A = 2.0 li a1, 0xBFC0 # Input B = -1.5 jal ra, bf16_mul # Output C = A * B li t1, 0xC040 # Expected Output C = -3.0 bne a0, t1, fail3 jal ra, print_pass j done fail3: jal ra, print_fail print_pass: la a0, pass_msg li a7, 4 ecall ret print_fail: la a0, fail_msg li a7, 4 ecall ret done: lw ra, 0(sp) addi sp, sp, 4 li a7, 10 ecall .globl bf16_mul bf16_mul: addi sp, sp, -16 sw s0, 0(sp) sw s1, 4(sp) sw s2, 8(sp) sw ra, 12(sp) # Extract sign, exponent, mantissa srli t0, a0, 15 andi t0, t0, 1 srli t1, a1, 15 andi t1, t1, 1 xor s0, t0, t1 # result sign srli t2, a0, 7 # exp A andi t2, t2, 0xFF srli t3, a1, 7 # exp B andi t3, t3, 0xFF andi t4, a0, 0x7F # mant A andi t5, a1, 0x7F # mant B # Step 1 — NaN Handling (highest priority) li t6, 0xFF # A is NaN? beq t2, t6, check_nan_a # B is NaN? beq t3, t6, check_nan_b j check_inf # no NaN → go Inf check check_nan_a: bnez t4, make_nan j check_inf # A=Inf but not NaN check_nan_b: bnez t5, make_nan j check_inf # Step 2 — Inf Handling check_inf: li t6, 0xFF # A = Inf beq t2, t6, handle_inf_a # B = Inf beq t3, t6, handle_inf_b j check_zero handle_inf_a: # A = Inf beqz t3, make_nan # Inf × 0 = NaN j make_inf handle_inf_b: # B = Inf beqz t2, make_nan # 0 × Inf = NaN j make_inf # Step 3 — Zero Handling check_zero: # A == 0 → 0 beqz t2, check_a_mant j check_b_zero check_a_mant: beqz t4, return_zero check_b_zero: # B == 0 → 0 beqz t3, check_b_mant j normalize_inputs check_b_mant: beqz t5, return_zero # NaN / Inf / Zero Makers make_nan: li a0, 0x7FC0 j mul_done make_inf: slli a0, s0, 15 li t6, 0x7F80 or a0, a0, t6 j mul_done return_zero: slli a0, s0, 15 j mul_done # Normalize A & B normalize_inputs: mv s1, zero # exp_adjust = 0 # Normalize A beqz t2, norm_a_sub ori t4, t4, 0x80 j norm_b norm_a_sub: beqz t4, return_zero norm_a_loop: andi t6, t4, 0x80 bnez t6, norm_a_end slli t4, t4, 1 addi s1, s1, -1 j norm_a_loop norm_a_end: li t2, 1 # Normalize B norm_b: beqz t3, norm_b_sub ori t5, t5, 0x80 j do_mul norm_b_sub: beqz t5, return_zero norm_b_loop: andi t6, t5, 0x80 bnez t6, norm_b_end slli t5, t5, 1 addi s1, s1, -1 j norm_b_loop norm_b_end: li t3, 1 # Mantissa Multiply do_mul: mul s2, t4, t5 add t6, t2, t3 addi t6, t6, -127 add t6, t6, s1 mv s1, t6 # Normalize multiplication result li t6, 0x8000 and t6, s2, t6 beqz t6, mul_norm_low srli s2, s2, 8 andi s2, s2, 0x7F addi s1, s1, 1 j check_exp mul_norm_low: srli s2, s2, 7 andi s2, s2, 0x7F # Exponent Overflow / Underflow check_exp: li t6, 255 bge s1, t6, make_inf blez s1, return_zero # Assemble Final BF16 final_output: slli a0, s0, 15 andi s1, s1, 0xFF slli s1, s1, 7 or a0, a0, s1 or a0, a0, s2 j mul_done # Epilogue mul_done: lw s0, 0(sp) lw s1, 4(sp) lw s2, 8(sp) lw ra, 12(sp) addi sp, sp, 16 ret ``` ::: #### Result ![螢幕擷取畫面 2025-12-12 110028](https://hackmd.io/_uploads/rJmOSZtzWg.png) ### Implementation Division #### Test Case | Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C | |--------|-----------|---------|---------|--------|--------|--------|--------| | 1 | Div | 6.0 | 2.0 | 0x40C0 | 0x4000 | 3.0 | 0x4040 | | 2 | Div | -2.0 | 4.0 | 0xC000 | 0x4080 | -0.5 | 0xBF00 | | 3 | Div | -Inf | -5.0 | 0xFF80 | 0xC0A0 | +Inf | 0x7F80 | :::spoiler Assembly Code ```c= .data pass_msg: .asciz "PASS\n" fail_msg: .asciz "FAIL\n" .text .globl main main: addi sp, sp, -4 sw ra, 0(sp) # Test 1: 6.0 / 2.0 = 3.0 test1: li a0, 0x40C0 # Input A = 6.0 li a1, 0x4000 # Input B = 2.0 jal ra, bf16_div # Output C = A / B li t1, 0x4040 # Expected Output C = 3.0 bne a0, t1, fail1 jal ra, print_pass j test2 fail1: jal ra, print_fail # Test 2: -2.0 / 4.0 = -0.5 test2: li a0, 0xC000 # Input A = -2.0 li a1, 0x4080 # Input B = 4.0 jal ra, bf16_div # Output C = A / B li t1, 0xBF00 # Expected Output C = -0.5 bne a0, t1, fail2 jal ra, print_pass j test3 fail2: jal ra, print_fail # Test 3: -Inf / -5.0 = +Inf test3: li a0, 0xFF80 # Input A = -Inf li a1, 0xC0A0 # Input B = -5.0 jal ra, bf16_div # Output C = A / B li t1, 0x7F80 # Expected Output C = +Inf bne a0, t1, fail3 jal ra, print_pass j done fail3: jal ra, print_fail done: lw ra, 0(sp) addi sp, sp, 4 li a7, 10 ecall print_pass: la a0, pass_msg li a7, 4 ecall jr ra print_fail: la a0, fail_msg li a7, 4 ecall jr ra .globl bf16_div bf16_div: # Extract sign, exponent, mantissa srli t0, a0, 15 # sign_a srli t1, a1, 15 # sign_b xor a2, t0, t1 # result_sign = a.sign ^ b.sign srli t2, a0, 7 # exp_a andi t2, t2, 0xFF srli t3, a1, 7 # exp_b andi t3, t3, 0xFF andi t4, a0, 0x7F # mant_a andi t5, a1, 0x7F # mant_b # Special cases : NaN, Inf, Zero li t6, 0xFF # B is NaN or Inf beq t3, t6, check_B_inf_nan # B is Zero → A/0 = Inf or -Inf beq t3, x0, check_B_zero j check_A_inf_nan # B is NaN or Inf check_B_inf_nan: bnez t5, make_nan # B mantissa != 0 → NaN # B = Inf beq t2, t6, make_nan # Inf / Inf = NaN # finite / Inf = Zero j make_zero # B == Zero check_B_zero: beq t5, x0, make_inf # A / 0 → ±Inf j continue # subnormal zero → go normalize # A is NaN or Inf check_A_inf_nan: beq t2, t6, A_inf_or_nan j continue A_inf_or_nan: bnez t4, make_nan # NaN # A is Inf j make_inf # Continue : Normalize A & B mantissas continue: # Normalize A li s1, 0 # exp_adjust = 0 beq t2, x0, normA_sub ori t4, t4, 0x80 # add hidden 1 j normB normA_sub: beq t4, x0, make_zero # A = 0 normA_loop: andi t6, t4, 0x80 bnez t6, normA_done slli t4, t4, 1 addi s1, s1, -1 j normA_loop normA_done: li t2, 1 # Normalize B normB: beq t3, x0, normB_sub ori t5, t5, 0x80 j divide_start normB_sub: beq t5, x0, make_zero # B=0 case already handled above normB_loop: andi t6, t5, 0x80 bnez t6, normB_done slli t5, t5, 1 addi s1, s1, -1 j normB_loop normB_done: li t3, 1 # Long Division 16-bit divide_start: slli a4, t4, 15 # dividend <<= 15 li a5, 0 # quotient = 0 mv t6, t5 # divisor working copy slli t6, t6, 15 li t0, 16 div_loop: slli a5, a5, 1 # shift quotient left sltu t1, a4, t6 # if dividend < divisor bne t1, x0, div_skip sub a4, a4, t6 # dividend -= divisor ori a5, a5, 1 # quotient bit = 1 div_skip: srli t6, t6, 1 # divisor >>= 1 addi t0, t0, -1 bne t0, x0, div_loop # Compute exponent sub a3, t2, t3 addi a3, a3, 127 beq t2, x0, decA j adjustB decA: addi a3, a3, -1 adjustB: beq t3, x0, incB j normalize_q incB: addi a3, a3, 1 # Normalize quotient (mantissa) normalize_q: li t0, 0x8000 and t1, a5, t0 bnez t1, shift8 # Shift left until top bit=1 norm_q_loop: and t1, a5, t0 bnez t1, shift8 slli a5, a5, 1 addi a3, a3, -1 j norm_q_loop shift8: srli a5, a5, 8 # Assemble final BF16 andi a5, a5, 0x7F li t0, 255 bge a3, t0, make_inf ble a3, x0, make_zero andi t0, a3, 255 slli t0, t0, 7 slli a0, a2, 15 or a0, a0, t0 or a0, a0, a5 jr ra # Special return paths make_zero: slli a0, a2, 15 jr ra make_inf: slli a0, a2, 15 li t0, 0x7F80 or a0, a0, t0 jr ra make_nan: li a0, 0x7FC0 jr ra ``` ::: #### Result ![螢幕擷取畫面 2025-12-13 164228](https://hackmd.io/_uploads/r1ZXvi9GZl.png) ### Implementation Square Root #### Test Case | Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C | |--------|-----------|---------|---------|--------|--------|--------|--------| | 1 | Sqrt | 4.0 | | 0x4080 | | 2.0 | 0x4000 | | 2 | Sqrt | +Inf | | 0x7F80 | | +Inf | 0x7F80 | | 3 | Sqrt | +0.0 | | 0x0000 | | +0.0 | 0x0000 | :::spoiler Assembly Code ```c= .data pass_msg: .asciz "Test Passed\n" fail_msg: .asciz "Test Failed\n" .text .globl main .globl bf16_sqrt main: addi sp, sp, -4 sw ra, 0(sp) test1: li a0, 0x4080 # Input A = 4.0 jal ra, bf16_sqrt # Output C = sqrt(A) li t1, 0x4000 # Expected Output C = 2.0 bne a0, t1, test1_fail jal ra, print_pass j test2 test1_fail: jal ra, print_fail j test2 test2: li a0, 0x7F80 # Input A = +Inf jal ra, bf16_sqrt # Output C = sqrt(A) li t1, 0x7F80 # Expected Output C = +Inf bne a0, t1, test2_fail jal ra, print_pass j test3 test2_fail: jal ra, print_fail j test3 test3: li a0, 0x0000 # Input A = +0.0 jal ra, bf16_sqrt # Output C = sqrt(A) li t1, 0x0000 # Expected Output C = +0.0 bne a0, t1, test3_fail jal ra, print_pass j program_end test3_fail: jal ra, print_fail j program_end program_end: lw ra, 0(sp) addi sp, sp, 4 li a7, 10 # syscall: exit (RARS) ecall print_pass: la a0, pass_msg li a7, 4 # print_string ecall ret print_fail: la a0, fail_msg li a7, 4 # print_string ecall ret bf16_sqrt: addi sp, sp, -32 sw ra, 28(sp) sw s0, 24(sp) sw s1, 20(sp) sw s2, 16(sp) sw s3, 12(sp) sw s4, 8(sp) sw s5, 4(sp) sw s6, 0(sp) srli t0, a0, 15 # Shift right 15 andi t0, t0, 1 # Extract sign bit srli t1, a0, 7 # Shift right 7 andi t1, t1, 0xFF # Extract exponent (8 bits) andi t2, a0, 0x7F # Extract mantissa (7 bits) li t3, 0xFF bne t1, t3, check_zero # if exp != 0xFF → not Inf/NaN bnez t2, return_a # exp=0xFF, mant!=0 → NaN, just return a (NaN propagation) bnez t0, return_nan # exp=0xFF, mant=0, sign!=0 → -Inf → NaN j return_a # exp=0xFF, mant=0, sign=0 → +Inf, return a check_zero: # Handle zero or t3, t1, t2 # if exp==0 && mant==0 → zero bnez t3, check_negative j return_zero check_negative: # Handle negative / denormals bnez t0, return_nan # negative number → NaN bnez t1, compute_sqrt # if exp!=0 → normalized → go sqrt j return_zero # denormals are flushed to zero compute_sqrt: addi s0, t1, -127 # s0 = unbiased exponent E ori s1, t2, 0x80 # s1 = mantissa with implicit leading 1 (1.xxx) andi t3, s0, 1 # t3 = E & 1 (check odd/even) beqz t3, even_exp # odd exponent: sqrt(2^E * M) = 2^{(E-1)/2} * sqrt(2*M) slli s1, s1, 1 # mantissa * 2 addi t4, s0, -1 srai t4, t4, 1 addi s2, t4, 127 # s2 = output exponent (biased) j binary_search even_exp: # even exponent: sqrt(2^E * M) = 2^{E/2} * sqrt(M) srai t4, s0, 1 addi s2, t4, 127 # s2 = output exponent (biased) binary_search: # Search integer y in [90,256] such that (y^2 >> 7) ≈ s1 li s3, 90 # low li s4, 256 # high li s5, 128 # best (initial guess) search_loop: bgt s3, s4, search_done add t3, s3, s4 srli t3, t3, 1 # mid = (low + high) / 2 mv a1, t3 mv a2, t3 jal ra, multiply # a0 = mid * mid mv t4, a0 srli t4, t4, 7 # compare (mid^2 >> 7) with s1 bgt t4, s1, search_high # too large → move high mv s5, t3 # accept mid as current best addi s3, t3, 1 # low = mid + 1 j search_loop search_high: addi s4, t3, -1 # high = mid - 1 j search_loop search_done: li t3, 256 blt s5, t3, check_low # if best < 256 → normal case srli s5, s5, 1 # if best >= 256 → renormalize addi s2, s2, 1 j extract_mant check_low: li t3, 128 bge s5, t3, extract_mant # already normalized (>=128) norm_loop: # ensure mantissa in [128,255] li t3, 128 bge s5, t3, extract_mant li t3, 1 ble s2, t3, extract_mant # avoid exponent underflow slli s5, s5, 1 addi s2, s2, -1 j norm_loop extract_mant: andi s6, s5, 0x7F # take low 7 bits as mantissa li t3, 0xFF bge s2, t3, return_inf # exponent overflow → +Inf blez s2, return_zero # exponent underflow / zero andi t3, s2, 0xFF slli t3, t3, 7 or a0, t3, s6 # pack exponent + mantissa (sign=0) j cleanup return_zero: li a0, 0x0000 j cleanup return_nan: li a0, 0x7FC0 # canonical quiet NaN j cleanup return_inf: li a0, 0x7F80 # +Inf j cleanup return_a: # a0 already has original input (Inf / NaN propagation etc.) cleanup: lw s6, 0(sp) lw s5, 4(sp) lw s4, 8(sp) lw s3, 12(sp) lw s2, 16(sp) lw s1, 20(sp) lw s0, 24(sp) lw ra, 28(sp) addi sp, sp, 32 ret # multiply(a1, a2) → a0 = a1 * a2 (unsigned, shift-add) multiply: li a0, 0 beqz a2, mult_done mult_loop: andi t0, a2, 1 beqz t0, mult_skip add a0, a0, a1 mult_skip: slli a1, a1, 1 srli a2, a2, 1 bnez a2, mult_loop mult_done: ret ``` ::: #### Result ![螢幕擷取畫面 2025-12-12 110425](https://hackmd.io/_uploads/ByvILWKMbg.png) ## 5-Stage Pipeline ### Introduction Modern RISC processors use a **five-stage pipeline** to execute instructions, the following figure is an example of a five-stage RISC-V pipeline. It splits executing one instruction into 5 consecutive stages so that multiple instructions can be in different stages at the same time, improving overall throughput. In the ideal case, the pipeline can complete nearly one instruction per cycle. ![螢幕擷取畫面 2025-12-13 165214](https://hackmd.io/_uploads/H1Z16o5zbe.png) 1️⃣ **IF – Instruction Fetch** - Use the current PC (Program Counter) to read the instruction from instruction memory. - Store the instruction into the IF/ID pipeline register and compute the next PC. 2️⃣ **ID – Instruction Decode** - Decode the instruction type (R-type, I-type, load/store, branch, …). - Read the source registers rs1 and rs2 from the register file. - Generate control signals such as RegWrite, MemRead, MemWrite, ALUOp, and ALUSrc. - For I-type and load/store instructions, generate a sign-extended immediate. 3️⃣ **EX – Execute** - The ALU performs operations such as addition/subtraction, shift, and comparison according to the control signals. - For load/store instructions, compute the memory address in this stage (base register + offset). - For branch instructions, compare rs1 and rs2 here to decide whether to branch, and produce the branch target address. 4️⃣ **MEM – Memory Access** - For **read** instructions such as `lw` / `lh` / `lb`: use the address computed in EX to read from data memory in this stage. - For **write** instructions such as `sw` / `sh` / `sb`: write the data from a register into data memory in this stage. - Instructions that do not access memory simply pass the ALU result to the next pipeline register in this stage. 5️⃣ **WB – Write Back** - Write the ALU result or data read from memory back to the destination register rd. - Whether the processor writes back is controlled by the RegWrite signal. The write-back data source is selected by a MUX, choosing between the ALU result and the data read from memory. ### How instructions are executed in the Ripes simulator `addi sp, sp, 32` means adding 32 to sp (stack pointer) and writing the result back to sp. In the context of a function’s stack frame, this typically releases 32 bytes of stack space by moving the stack pointer back to its original position, reclaiming the temporary space allocated on the stack. <span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 1 - IF</span> ![螢幕擷取畫面 2025-12-14 005503](https://hackmd.io/_uploads/S166czoGWl.png) - In the figure, the instruction memory address is 0x00000000, meaning the processor uses PC = `0x00000000` to fetch the instruction from instruction memory. - The MUX determines whether the PC should be incremented by 4 or 2. Since the fetched addi is a 32-bit instruction, the MUX selects +4, so the sequential next PC candidate is `0x00000000 + 0x00000004 = 0x00000004`. - Because `addi` does not change control flow, the processor follows the sequential path instead of a branch or jump target. As a result, the next PC is 0x00000004, the same as the sequential next PC candidate. - `0x02010113` is the 32-bit machine code of a RISC-V instruction. When decoded, it corresponds to `addi x2, x2, 32`. In RISC-V, x2 is the sp, so this is also written as `addi sp, sp, 32`. <span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 2 - ID</span> ![螢幕擷取畫面 2025-12-14 140717](https://hackmd.io/_uploads/r15N4RoGWe.png) | Field | Bit range | Value (bin) | Value (hex) | Meaning | |----------|-----------|----------------|-----------------|---------| | imm[11:0] | [31:20] | 000000100000 | 0x020 | immediate = 32 | | rs1 | [19:15] | 00010 | 0x02 | rs1 = x2 (sp) | | funct3 | [14:12] | 000 | 0x0 | funct3 = 000 → ADDI | | rd | [11:7] | 00010 | 0x02 | rd = x2 (sp) | | opcode | [6:0] | 0010011 | 0x13 | OP-IMM (addi/andi/ori/...) | 1. Decode - After the machine code `0x02010113` enters the Decode block, it is recognized as opcode = ADDI. - This means the instruction is an I-type add-immediate instruction. 2. Registers - In the Registers block: - `1 idx = 0x02` indicates that rs1 reads x2. - `2 idx = 0x00` indicates that the other read port reads x0. `addi` does not actually require rs2, but the hardware typically still outputs a value for this port. - The values read from the register file are: - `Reg1 = 0x7ffffff0`: the current value of x2. - `Reg2 = 0x00000000`: read from x0, so it is 0. 3. Imm (Immediate Generator) - For `addi`, the immediate is 32. After sign extension, it is shown as `0x00000020`. 4. ID/EX Pipeline Register - The ID/EX pipeline register latches the values prepared in the ID stage and forwards them to the EX stage: - Reg1: `0x7ffffff0` - Reg2: `0x00000000` - Imm : `0x00000020` - This allows the EX stage to use the ALU to compute x2 + 32. <span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 3 - EX</span> ![螢幕擷取畫面 2025-12-14 164555](https://hackmd.io/_uploads/ByYIKgnz-g.png) 1) Values coming from the ID/EX (IDEX) register - `0x7ffffff0`: Reg1, which is the current value of x2. - `0x00000020`: Imm, the immediate value 32. - `0x00000000`: Reg2. This `addi` does not use rs2, so this value is typically 0. - `0x00000000` / `0x00000004`: PC and PC+4. These are mainly used for branch/jump address calculations and are not used by this `addi`. - `0x02` represents register number 2, which is x2. 2) How the ALU operands are selected - Operand 1 (upper ALU input) selects Reg1 = `0x7ffffff0`. - Operand 2 (under ALU input)is selected by a MUX. For `addi`, the control signal selects the immediate (Imm) instead of Reg2, so Operand 2 = `0x00000020`. 3) What operation the ALU performs - `addi` uses the ALU to perform addition: `0x7ffffff0 + 0x00000020 = 0x80000010`. 4) Why the Branch block indicates not taken - `addi` is not a branch or jump instruction, so no control-flow change occurs. - Therefore, **Branch taken = 0**, and the PC continues along the sequential path (PC+4). 5) What is latched into the EX/MEM pipeline register - The EX/MEM pipeline register latches the ALU result `0x80000010` and forwards it to the next stage. - It also carries the destination register information (rd = x2) and the required control signals for the following stages. <span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 4 - MEM</span> ![螢幕擷取畫面 2025-12-14 165315](https://hackmd.io/_uploads/Sk7GighfWe.png) This figure shows the Memory Access stage while executing `addi x2, x2, 32`. Since `addi` does not access data memory, the memory hardware is idle and the datapath simply forwards values toward the WB stage. - `0x80000010` (Addr into Data memory) : this is the ALU result computed in the EX stage. For `addi`, it is the computed value to be written back to x2, not a real memory address to be used. It still appears on the address input because the datapath wiring is fixed. - Wr En is red (disabled) : this indicates **MemWrite = 0**, so no store occurs and data memory is not updated. - `Data in = 0x00000000` : this would be the data written to memory for a store instruction, but since Wr En is disabled, this value is not used here. - `Read out = 0x00000000` : this output is meaningful for load instructions. For `addi` (MemRead = 0), it typically shows a default value and does not indicate an actual memory read. - `0x02` : this is the destination register index rd = x2, carried forward so the WB stage knows where to write the result. - `0x00000004` : this is the sequential next PC related value (PC+4) that may also be carried along the pipeline, although it is not directly used by `addi` in MEM/WB. <span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 5 - WB</span> ![螢幕擷取畫面 2025-12-14 165528](https://hackmd.io/_uploads/Bytjslhz-g.png) This figure shows the Write Back stage for the instruction `addi x2, x2, 32`. At this stage, the processor writes the computed result back to register x2. - `0x80000010` : this is the ALU result produced in the EX stage. In WB, it is used as the data to be written back to the register file. - The green MUX selection : the write-back MUX selects the source of the data written to the register file. - For arithmetic instructions like `addi`, it selects the ALU result `0x80000010`. - `0x00000000` corresponds to memory read data, which is used for load instructions and is not used by `addi` here. - `0x02` : this is the destination register index rd = x2, so the value `0x80000010` will be written into x2, which is the stack pointer. - `0x00000004` : this is the sequential next PC value carried along the pipeline. It does not affect the write-back of `addi`, but it may be useful for debugging for other instructions.