# CA Homework 1 ## problem B I translated the C functions clz, uf8_decode, and uf8_encode into RISC-V assembly and built an automated test that exhaustively evaluates all inputs from 0 to 255. ### clz <details> <summary><b>clz</b></summary> ``` c= static inline unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return n - x; } ``` </details> ### Assembly code ``` .text .globl main main: li a0, 0x0000F000 # test 0x0000F000 jal ra, clz # call clz done: j done clz: li t0, 32 # n = 32 li t1, 16 # c = 16 clz_loop: srl t2, a0, t1 # y = x >> c beq t2, zero, clz_skip # if (!y) skip sub t0, t0, t1 # n -= c mv a0, t2 # x = y clz_skip: srli t1, t1, 1 # c >>= 1 bne t1, zero, clz_loop # while (c) sub a0, t0, a0 # return n - x ret ``` ### uf8_decode ``` .text .globl main main: # test 0x7C li a0, 0x7C jal ra, uf8_decode # a0 3568 (0xDF0) done: j done uf8_decode: andi t0, a0, 0x0f # t0 = mantissa srli t1, a0, 4 # t1 = exponent li t2, 15 sub t2, t2, t1 # t2 = 15 - exponent li t3, 0x7FFF srl t3, t3, t2 slli t3, t3, 4 # t3 = offset sll t0, t0, t1 # t0 = mantissa << exponent add a0, t0, t3 # a0 = (mantissa << exponent) + offset ret ``` ### main ``` .text .globl main main: # test function jal ra, test # a0 (1 = pass, 0 = fail). done: li a7, 10 ecall test: addi sp, sp, -20 sw ra, 16(sp) #return address sw s0, 12(sp) #s0 previous_value sw s1, 8(sp) #s1 passed flag sw s2, 4(sp) #s2 loop counter sw s3, 0(sp) #s3 result of decode li s0, -1 li s1, 1 mv s2, zero test_loop_start: li t0, 256 bge s2, t0, test_loop_end mv a0, s2 jal ra, uf8_decode mv s3, a0 jal ra, uf8_encode beq s2, a0, check_monotonicity j test_set_fail check_monotonicity: bgt s3, s0, update_and_continue j test_set_fail update_and_continue: mv s0, s3 addi s2, s2, 1 j test_loop_start test_set_fail: mv s1, zero j update_and_continue test_loop_end: mv a0, s1 lw s3, 0(sp) lw s2, 4(sp) lw s1, 8(sp) lw s0, 12(sp) lw ra, 16(sp) addi sp, sp, 20 ret uf8_encode: addi 
sp, sp, -16 sw ra, 12(sp) sw s0, 8(sp) sw s1, 4(sp) sw s2, 0(sp) li t0, 16 bge a0, t0, handle_large_value_enc j epilogue_encode handle_large_value_enc: mv s0, a0 jal ra, clz li t0, 31 sub t1, t0, a0 mv s1, zero mv s2, zero li t0, 5 blt t1, t0, find_exact_exponent addi t0, t1, -4 mv s1, t0 li t0, 15 ble s1, t0, calculate_overflow_loop mv s1, t0 calculate_overflow_loop: mv t2, zero for_loop_start: bge t2, s1, for_loop_end slli s2, s2, 1 addi s2, s2, 16 addi t2, t2, 1 j for_loop_start for_loop_end: adjust_exponent_loop: ble s1, zero, find_exact_exponent bge s0, s2, find_exact_exponent addi s2, s2, -16 srli s2, s2, 1 addi s1, s1, -1 j adjust_exponent_loop find_exact_exponent: exact_exponent_loop: li t0, 15 bge s1, t0, exact_exponent_end slli t3, s2, 1 addi t3, t3, 16 blt s0, t3, exact_exponent_end mv s2, t3 addi s1, s1, 1 j exact_exponent_loop exact_exponent_end: sub t0, s0, s2 srl t0, t0, s1 slli a0, s1, 4 or a0, a0, t0 epilogue_encode: lw s2, 0(sp) lw s1, 4(sp) lw s0, 8(sp) lw ra, 12(sp) addi sp, sp, 16 ret uf8_decode: andi t0, a0, 0x0f # t0 = mantissa = fl & 0x0f srli t1, a0, 4 # t1 = exponent = fl >> 4 li t2, 15 # t2 = 15 sub t2, t2, t1 # t2 = 15 - exponent li t3, 0x7FFF # t3 = 0x7FFF srl t3, t3, t2 # t3 = 0x7FFF >> t2 slli t3, t3, 4 # t3 = offset = (result << 4) sll t0, t0, t1 # t0 = mantissa << exponent add a0, t0, t3 # a0 = (mantissa << exponent) + offset ret clz: mv t3, a0 # t3 = x li t0, 32 # t0 = n = 32 li t1, 16 # t1 = c = 16 clz_loop: srl t2, t3, t1 # t2 = y = x >> c beq t2, zero, clz_skip # if (y == 0) then skip sub t0, t0, t1 # n -= c mv t3, t2 # x = y clz_skip: srli t1, t1, 1 # c >>= 1 bne t1, zero, clz_loop # while (c != 0) sub a0, t0, t3 # return n - x ret ``` ### Analysis Putting the code above into the editor, we can see that Ripes doesn't execute it literally. Instead, it replaces each pseudo-instruction with an equivalent real instruction, and changes register names from ABI names to sequential (x0–x31) names. 
The translated code looks like: ``` 00000000 <main>: 0: 008000ef jal x1 8 <test> 00000004 <done>: 4: 0000006f jal x0 0 <done> 00000008 <test>: 8: fec10113 addi x2 x2 -20 c: 00112823 sw x1 16 x2 10: 00812623 sw x8 12 x2 14: 00912423 sw x9 8 x2 18: 01212223 sw x18 4 x2 1c: 01312023 sw x19 0 x2 20: fff00413 addi x8 x0 -1 24: 00100493 addi x9 x0 1 28: 00000913 addi x18 x0 0 0000002c <test_loop_start>: 2c: 10000293 addi x5 x0 256 30: 02595c63 bge x18 x5 56 <test_loop_end> 34: 00090513 addi x10 x18 0 38: 11c000ef jal x1 284 <uf8_decode> 3c: 00050993 addi x19 x10 0 40: 048000ef jal x1 72 <uf8_encode> 44: 00a90463 beq x18 x10 8 <check_monotonicity> 48: 0180006f jal x0 24 <test_set_fail> 0000004c <check_monotonicity>: 4c: 01344463 blt x8 x19 8 <update_and_continue> 50: 0100006f jal x0 16 <test_set_fail> 00000054 <update_and_continue>: 54: 00098413 addi x8 x19 0 58: 00190913 addi x18 x18 1 5c: fd1ff06f jal x0 -48 <test_loop_start> 00000060 <test_set_fail>: 60: 00000493 addi x9 x0 0 64: ff1ff06f jal x0 -16 <update_and_continue> 00000068 <test_loop_end>: 68: 00048513 addi x10 x9 0 6c: 00012983 lw x19 0 x2 70: 00412903 lw x18 4 x2 74: 00812483 lw x9 8 x2 78: 00c12403 lw x8 12 x2 7c: 01012083 lw x1 16 x2 80: 01410113 addi x2 x2 20 84: 00008067 jalr x0 x1 0 00000088 <uf8_encode>: 88: ff010113 addi x2 x2 -16 8c: 00112623 sw x1 12 x2 90: 00812423 sw x8 8 x2 94: 00912223 sw x9 4 x2 98: 01212023 sw x18 0 x2 9c: 01000293 addi x5 x0 16 a0: 00555463 bge x10 x5 8 <handle_large_value_enc> a4: 0980006f jal x0 152 <epilogue_encode> 000000a8 <handle_large_value_enc>: a8: 00050413 addi x8 x10 0 ac: 0d4000ef jal x1 212 <clz> b0: 01f00293 addi x5 x0 31 b4: 40a28333 sub x6 x5 x10 b8: 00000493 addi x9 x0 0 bc: 00000913 addi x18 x0 0 c0: 00500293 addi x5 x0 5 c4: 04534463 blt x6 x5 72 <find_exact_exponent> c8: ffc30293 addi x5 x6 -4 cc: 00028493 addi x9 x5 0 d0: 00f00293 addi x5 x0 15 d4: 0092d463 bge x5 x9 8 <calculate_overflow_loop> d8: 00028493 addi x9 x5 0 000000dc <calculate_overflow_loop>: 
dc: 00000393 addi x7 x0 0 000000e0 <for_loop_start>: e0: 0093da63 bge x7 x9 20 <for_loop_end> e4: 00191913 slli x18 x18 1 e8: 01090913 addi x18 x18 16 ec: 00138393 addi x7 x7 1 f0: ff1ff06f jal x0 -16 <for_loop_start> 000000f4 <for_loop_end>: f4: 00905c63 bge x0 x9 24 <find_exact_exponent> f8: 01245a63 bge x8 x18 20 <find_exact_exponent> fc: ff090913 addi x18 x18 -16 100: 00195913 srli x18 x18 1 104: fff48493 addi x9 x9 -1 108: fedff06f jal x0 -20 <for_loop_end> 0000010c <find_exact_exponent>: 10c: 00f00293 addi x5 x0 15 110: 0054de63 bge x9 x5 28 <exact_exponent_end> 114: 00191e13 slli x28 x18 1 118: 010e0e13 addi x28 x28 16 11c: 01c44863 blt x8 x28 16 <exact_exponent_end> 120: 000e0913 addi x18 x28 0 124: 00148493 addi x9 x9 1 128: fe5ff06f jal x0 -28 <find_exact_exponent> 0000012c <exact_exponent_end>: 12c: 412402b3 sub x5 x8 x18 130: 0092d2b3 srl x5 x5 x9 134: 00449513 slli x10 x9 4 138: 00556533 or x10 x10 x5 0000013c <epilogue_encode>: 13c: 00012903 lw x18 0 x2 140: 00412483 lw x9 4 x2 144: 00812403 lw x8 8 x2 148: 00c12083 lw x1 12 x2 14c: 01010113 addi x2 x2 16 150: 00008067 jalr x0 x1 0 00000154 <uf8_decode>: 154: 00f57293 andi x5 x10 15 158: 00455313 srli x6 x10 4 15c: 00f00393 addi x7 x0 15 160: 406383b3 sub x7 x7 x6 164: 00008e37 lui x28 0x8 168: fffe0e13 addi x28 x28 -1 16c: 007e5e33 srl x28 x28 x7 170: 004e1e13 slli x28 x28 4 174: 006292b3 sll x5 x5 x6 178: 01c28533 add x10 x5 x28 17c: 00008067 jalr x0 x1 0 00000180 <clz>: 180: 00050e13 addi x28 x10 0 184: 02000293 addi x5 x0 32 188: 01000313 addi x6 x0 16 0000018c <clz_loop>: 18c: 006e53b3 srl x7 x28 x6 190: 00038663 beq x7 x0 12 <clz_skip> 194: 406282b3 sub x5 x5 x6 198: 00038e13 addi x28 x7 0 0000019c <clz_skip>: 19c: 00135313 srli x6 x6 1 1a0: fe0316e3 bne x6 x0 -20 <clz_loop> 1a4: 41c28533 sub x10 x5 x28 1a8: 00008067 jalr x0 x1 0 ``` ### 5-stage pipelined processor ![image](https://hackmd.io/_uploads/HkXEw7ITxe.png) ### Instruction Fetch (IF) Fetch the next instruction at PC (e.g., jal ra, test, 
loop heads like test_loop_start, clz_loop). The code has many branches and jumps—bge s2, t0, test_loop_end, beq t2, zero, clz_skip, jal x0, … (unconditional jump), and jalr x0, x1, 0 (ret). IF speculatively fetches the fall-through; once EX resolves a branch/jump, any misfetched instruction is flushed (control hazard → bubbles). ### Instruction Decode & Register Fetch (ID) Examples: Loop control reads: bge s2, t0, … (reads s2, t0). Argument setup before calls: mv a0, s2 (expanded as addi a0, s2, 0) for uf8_decode/encode. Stack frame prologues: sw ra, 16(sp), sw s0, 12(sp) preserve live registers. Data hazards: Back-to-back use of values returned from functions (e.g., clz → used immediately in uf8_encode) depends on forwarding; without it, the core inserts stalls. ### Execute (EX) Representative operations clz (count leading zeros): Binary search via shifts: srl t2, t3, t1 with c stepping 16→8→4→2→1; branch on zero beq t2, zero, clz_skip; maintain n -= c; return sub a0, t0, t3. Branches resolve in EX → frequent control hazards. uf8_decode: Field split: andi t0, a0, 0x0f (mantissa), srli t1, a0, 4 (exponent). Build 0x7FFF via lui+addi, then srl/slli to form offset; compute (mantissa<<exp)+offset. uf8_encode: Estimate msb = 31 - clz(value); guess exponent ≈ msb - 4 (clamped to 15). Build overflow by loop: overflow = (overflow<<1) + 16. Adjust down while value < overflow: (overflow-16)>>1, exponent--. Refine up while value ≥ next = (overflow<<1)+16: update overflow, exponent++. Compute mantissa = (value - overflow) >> exponent, then pack (exponent<<4)|mantissa. Branch resolution: blt/bge/beq decide in EX; IF may need to discard prefetched instructions. ### Memory Access (MEM) Role: Data memory loads/stores. In this code, MEM is mostly stack traffic at function boundaries. Typical instructions: sw ra/s0/… on entry; lw on exit to restore. The algorithm itself keeps most working data in registers, so MEM is dominated by prologue/epilogue traffic. 
Note: If the simulator models cache/latency, lw followed immediately by a consumer can cause a load-use stall without forwarding. ### Register Write Back (WB) Function returns: clz, uf8_decode, uf8_encode write results to a0; callers often consume them in the next instruction (RAW hazard → rely on forwarding). Loop variables: addi s2, s2, 1 (i++) is written back and then read right away by bge s2, 256. ![image](https://hackmd.io/_uploads/HyPEhmITxx.png) This routine (test) validates the round-trip correctness and monotonicity of a custom 8-bit “uf8” number format by exhaustively iterating all 256 encodings. For each fl ∈ [0,255], it: decodes fl → value = uf8_decode(fl), re-encodes value → fl2 = uf8_encode(value), checks that fl2 == fl (round-trip), and enforces value is strictly increasing over fl (monotonicity). If any check fails, a boolean flag passed is cleared. The function returns 1 (pass) or 0 (fail) in a0. ![image](https://hackmd.io/_uploads/Syg1-mzRge.png) ## problem C ### static inline bool bf16_isnan(bf16_t a) ``` .text .globl main main: li a0, 0x7FC0 jal ra, bf16_isnan mv s0, a0 li a0, 0x7F80 jal ra, bf16_isnan mv s1, a0 done: j done bf16_isnan: li t0, 0x7F80 and t1, a0, t0 bne t1, t0, is_not_nan li t0, 0x007F and t1, a0, t0 sltu a0, zero, t1 ret is_not_nan: li a0, 0 ret ``` ### static inline bool bf16_isinf(bf16_t a) ``` .text .globl main main: li a0, 0x7F80 jal ra, bf16_isinf mv s0, a0 li a0, 0x7FC0 jal ra, bf16_isinf mv s1, a0 done: j done bf16_isinf: li t0, 0x7F80 and t1, a0, t0 bne t1, t0, is_not_inf li t0, 0x007F and t1, a0, t0 seqz a0, t1 ret is_not_inf: li a0, 0 ret ``` ### static inline bool bf16_iszero(bf16_t a) ``` .text .globl main main: li a0, 0x8000 jal ra, bf16_iszero mv s0, a0 li a0, 0x3F80 jal ra, bf16_iszero mv s1, a0 done: j done bf16_iszero: li t0, 0x7FFF and t1, a0, t0 seqz a0, t1 ret ``` ### static inline bf16_t f32_to_bf16(float val) ``` .text .globl main main: li a0, 0x7F800000 jal ra, f32_to_bf16 mv s0, a0 li a0, 0x40000000 jal 
ra, f32_to_bf16 mv s1, a0 li a0, 0x40490FDB jal ra, f32_to_bf16 mv s2, a0 done: j done f32_to_bf16: srli t0, a0, 23 andi t0, t0, 0xFF li t1, 0xFF bne t0, t1, handle_normal handle_special: srli a0, a0, 16 ret handle_normal: srli t0, a0, 16 andi t0, t0, 1 li t1, 0x7FFF add t0, t0, t1 add a0, a0, t0 srli a0, a0, 16 ret ``` ### static inline float bf16_to_f32(bf16_t val) ``` .text .globl main main: li a0, 0x4000 jal ra, bf16_to_f32 mv s0, a0 li a0, 0x7F80 jal ra, bf16_to_f32 mv s1, a0 done: j done bf16_to_f32: slli a0, a0, 16 ret ``` ### static inline bf16_t bf16_add(bf16_t a, bf16_t b) ``` .text .globl main main: li a0, 0x3F80 li a1, 0x4000 jal ra, bf16_add mv s0, a0 li a0, 0x4040 li a1, 0xBF80 jal ra, bf16_add mv s1, a0 li a0, 0x3FC0 li a1, 0x3FC0 jal ra, bf16_add mv s2, a0 done: j done bf16_add: addi sp, sp, -48 sw ra, 44(sp) sw s0, 40(sp) # s0: a sw s1, 36(sp) # s1: b sw s2, 32(sp) # s2: sign_a sw s3, 28(sp) # s3: sign_b sw s4, 24(sp) # s4: exp_a sw s5, 20(sp) # s5: exp_b sw s6, 16(sp) # s6: mant_a sw s7, 12(sp) # s7: mant_b sw s8, 8(sp) # s8: result_sign sw s9, 4(sp) # s9: result_exp sw s10, 0(sp) # s10: result_mant mv s0, a0 mv s1, a1 srli s2, s0, 15 # s2 = sign_a andi s2, s2, 1 srli s3, s1, 15 # s3 = sign_b andi s3, s3, 1 srli s4, s0, 7 # s4 = exp_a andi s4, s4, 0xFF srli s5, s1, 7 # s5 = exp_b andi s5, s5, 0xFF andi s6, s0, 0x7F # s6 = mant_a andi s7, s1, 0x7F # s7 = mant_b li t0, 0xFF bne s4, t0, check_b_special bne s6, zero, return_a bne s5, t0, return_a bne s7, zero, return_b beq s2, s3, return_b li a0, 0x7FC0 j epilogue check_b_special: bne s5, t0, check_zeros bne s7, zero, return_b j return_b check_zeros: or t0, s4, s6 beq t0, zero, return_b or t0, s5, s7 beq t0, zero, return_a li t0, 0x80 bne s4, zero, add_implicit_a j check_implicit_b add_implicit_a: or s6, s6, t0 check_implicit_b: bne s5, zero, add_implicit_b j align_exponents add_implicit_b: or s7, s7, t0 align_exponents: sub t0, s4, s5 bgtz t0, a_exp_larger bltz t0, b_exp_larger mv s9, s4 j 
add_sub_mantissas a_exp_larger: mv s9, s4 li t1, 8 bgt t0, t1, return_a srl s7, s7, t0 j add_sub_mantissas b_exp_larger: mv s9, s5 neg t0, t0 li t1, 8 bgt t0, t1, return_b srl s6, s6, t0 add_sub_mantissas: bne s2, s3, different_signs same_signs: mv s8, s2 add s10, s6, s7 andi t0, s10, 0x100 beq t0, zero, pack_result srli s10, s10, 1 addi s9, s9, 1 li t0, 0xFF bge s9, t0, return_infinity j pack_result different_signs: bge s6, s7, a_mant_larger mv s8, s3 sub s10, s7, s6 j normalize_sub_result a_mant_larger: mv s8, s2 sub s10, s6, s7 normalize_sub_result: beq s10, zero, return_pos_zero normalize_loop: andi t0, s10, 0x80 bne t0, zero, pack_result slli s10, s10, 1 addi s9, s9, -1 blez s9, return_pos_zero j normalize_loop pack_result: slli t0, s8, 15 andi s9, s9, 0xFF slli t1, s9, 7 andi t2, s10, 0x7F or a0, t0, t1 or a0, a0, t2 j epilogue return_a: mv a0, s0 j epilogue return_b: mv a0, s1 j epilogue return_pos_zero: li a0, 0 j epilogue return_infinity: slli t0, s8, 15 li t1, 0x7F80 or a0, t0, t1 j epilogue epilogue: lw s10, 0(sp) lw s9, 4(sp) lw s8, 8(sp) lw s7, 12(sp) lw s6, 16(sp) lw s5, 20(sp) lw s4, 24(sp) lw s3, 28(sp) lw s2, 32(sp) lw s1, 36(sp) lw s0, 40(sp) lw ra, 44(sp) addi sp, sp, 48 ret ``` ``` 00000000 <main>: 0: 00004537 lui x10 0x4 4: f8050513 addi x10 x10 -128 8: 000045b7 lui x11 0x4 c: 03c000ef jal x1 60 <bf16_add> 10: 00050413 addi x8 x10 0 14: 00004537 lui x10 0x4 18: 04050513 addi x10 x10 64 1c: 0000c5b7 lui x11 0xc 20: f8058593 addi x11 x11 -128 24: 024000ef jal x1 36 <bf16_add> 28: 00050493 addi x9 x10 0 2c: 00004537 lui x10 0x4 30: fc050513 addi x10 x10 -64 34: 000045b7 lui x11 0x4 38: fc058593 addi x11 x11 -64 3c: 00c000ef jal x1 12 <bf16_add> 40: 00050913 addi x18 x10 0 00000044 <done>: 44: 0000006f jal x0 0 <done> 00000048 <bf16_add>: 48: fd010113 addi x2 x2 -48 4c: 02112623 sw x1 44 x2 50: 02812423 sw x8 40 x2 54: 02912223 sw x9 36 x2 58: 03212023 sw x18 32 x2 5c: 01312e23 sw x19 28 x2 60: 01412c23 sw x20 24 x2 64: 01512a23 sw x21 20 x2 68: 
01612823 sw x22 16 x2 6c: 01712623 sw x23 12 x2 70: 01812423 sw x24 8 x2 74: 01912223 sw x25 4 x2 78: 01a12023 sw x26 0 x2 7c: 00050413 addi x8 x10 0 80: 00058493 addi x9 x11 0 84: 00f45913 srli x18 x8 15 88: 00197913 andi x18 x18 1 8c: 00f4d993 srli x19 x9 15 90: 0019f993 andi x19 x19 1 94: 00745a13 srli x20 x8 7 98: 0ffa7a13 andi x20 x20 255 9c: 0074da93 srli x21 x9 7 a0: 0ffafa93 andi x21 x21 255 a4: 07f47b13 andi x22 x8 127 a8: 07f4fb93 andi x23 x9 127 ac: 0ff00293 addi x5 x0 255 b0: 025a1063 bne x20 x5 32 <check_b_special> b4: 100b1463 bne x22 x0 264 <return_a> b8: 105a9263 bne x21 x5 260 <return_a> bc: 100b9463 bne x23 x0 264 <return_b> c0: 11390263 beq x18 x19 260 <return_b> c4: 00008537 lui x10 0x8 c8: fc050513 addi x10 x10 -64 cc: 11c0006f jal x0 284 <epilogue> 000000d0 <check_b_special>: d0: 005a9663 bne x21 x5 12 <check_zeros> d4: 0e0b9863 bne x23 x0 240 <return_b> d8: 0ec0006f jal x0 236 <return_b> 000000dc <check_zeros>: dc: 016a62b3 or x5 x20 x22 e0: 0e028263 beq x5 x0 228 <return_b> e4: 017ae2b3 or x5 x21 x23 e8: 0c028a63 beq x5 x0 212 <return_a> ec: 08000293 addi x5 x0 128 f0: 000a1463 bne x20 x0 8 <add_implicit_a> f4: 0080006f jal x0 8 <check_implicit_b> 000000f8 <add_implicit_a>: f8: 005b6b33 or x22 x22 x5 000000fc <check_implicit_b>: fc: 000a9463 bne x21 x0 8 <add_implicit_b> 100: 0080006f jal x0 8 <align_exponents> 00000104 <add_implicit_b>: 104: 005bebb3 or x23 x23 x5 00000108 <align_exponents>: 108: 415a02b3 sub x5 x20 x21 10c: 00504863 blt x0 x5 16 <a_exp_larger> 110: 0202c063 blt x5 x0 32 <b_exp_larger> 114: 000a0c93 addi x25 x20 0 118: 02c0006f jal x0 44 <add_sub_mantissas> 0000011c <a_exp_larger>: 11c: 000a0c93 addi x25 x20 0 120: 00800313 addi x6 x0 8 124: 08534c63 blt x6 x5 152 <return_a> 128: 005bdbb3 srl x23 x23 x5 12c: 0180006f jal x0 24 <add_sub_mantissas> 00000130 <b_exp_larger>: 130: 000a8c93 addi x25 x21 0 134: 405002b3 sub x5 x0 x5 138: 00800313 addi x6 x0 8 13c: 08534463 blt x6 x5 136 <return_b> 140: 005b5b33 srl x22 x22 x5 
00000144 <add_sub_mantissas>: 144: 03391463 bne x18 x19 40 <different_signs> 00000148 <same_signs>: 148: 00090c13 addi x24 x18 0 14c: 017b0d33 add x26 x22 x23 150: 100d7293 andi x5 x26 256 154: 04028663 beq x5 x0 76 <pack_result> 158: 001d5d13 srli x26 x26 1 15c: 001c8c93 addi x25 x25 1 160: 0ff00293 addi x5 x0 255 164: 065cd863 bge x25 x5 112 <return_infinity> 168: 0380006f jal x0 56 <pack_result> 0000016c <different_signs>: 16c: 017b5863 bge x22 x23 16 <a_mant_larger> 170: 00098c13 addi x24 x19 0 174: 416b8d33 sub x26 x23 x22 178: 00c0006f jal x0 12 <normalize_sub_result> 0000017c <a_mant_larger>: 17c: 00090c13 addi x24 x18 0 180: 417b0d33 sub x26 x22 x23 00000184 <normalize_sub_result>: 184: 040d0463 beq x26 x0 72 <return_pos_zero> 00000188 <normalize_loop>: 188: 080d7293 andi x5 x26 128 18c: 00029a63 bne x5 x0 20 <pack_result> 190: 001d1d13 slli x26 x26 1 194: fffc8c93 addi x25 x25 -1 198: 03905a63 bge x0 x25 52 <return_pos_zero> 19c: fedff06f jal x0 -20 <normalize_loop> 000001a0 <pack_result>: 1a0: 00fc1293 slli x5 x24 15 1a4: 0ffcfc93 andi x25 x25 255 1a8: 007c9313 slli x6 x25 7 1ac: 07fd7393 andi x7 x26 127 1b0: 0062e533 or x10 x5 x6 1b4: 00756533 or x10 x10 x7 1b8: 0300006f jal x0 48 <epilogue> 000001bc <return_a>: 1bc: 00040513 addi x10 x8 0 1c0: 0280006f jal x0 40 <epilogue> 000001c4 <return_b>: 1c4: 00048513 addi x10 x9 0 1c8: 0200006f jal x0 32 <epilogue> 000001cc <return_pos_zero>: 1cc: 00000513 addi x10 x0 0 1d0: 0180006f jal x0 24 <epilogue> 000001d4 <return_infinity>: 1d4: 00fc1293 slli x5 x24 15 1d8: 00008337 lui x6 0x8 1dc: f8030313 addi x6 x6 -128 1e0: 0062e533 or x10 x5 x6 1e4: 0040006f jal x0 4 <epilogue> 000001e8 <epilogue>: 1e8: 00012d03 lw x26 0 x2 1ec: 00412c83 lw x25 4 x2 1f0: 00812c03 lw x24 8 x2 1f4: 00c12b83 lw x23 12 x2 1f8: 01012b03 lw x22 16 x2 1fc: 01412a83 lw x21 20 x2 200: 01812a03 lw x20 24 x2 204: 01c12983 lw x19 28 x2 208: 02012903 lw x18 32 x2 20c: 02412483 lw x9 36 x2 210: 02812403 lw x8 40 x2 214: 02c12083 lw x1 44 x2 218: 
03010113 addi x2 x2 48 21c: 00008067 jalr x0 x1 0 ``` ![image](https://hackmd.io/_uploads/HyoH-Xz0el.png) ### static inline bf16_t bf16_sub(bf16_t a, bf16_t b) ``` .text .globl main main: li a0, 0x4040 # a = 3.0 li a1, 0x3F80 # b = 1.0 jal ra, bf16_sub mv s0, a0 # 0x4000 li a0, 0x3F80 # a = 1.0 li a1, 0x4040 # b = 3.0 jal ra, bf16_sub mv s1, a0 # 0xC000 li a0, 0x40A0 # a = 5.0 li a1, 0x40A0 # b = 5.0 jal ra, bf16_sub mv s2, a0 # 0x0000 done: j done bf16_sub: li t0, 0x8000 xor a1, a1, t0 j bf16_add ``` ### static inline bf16_t bf16_mul(bf16_t a, bf16_t b) ``` .text .globl main .globl bf16_mul main: li a0, 0x4000 # a = 2.0 (bf16) li a1, 0x4040 # b = 3.0 (bf16) jal ra, bf16_mul # a0 <- result mv a2, a0 # 0x40C0 #li a0, 0xC000 # a = -2.0 #li a1, 0x4040 # b = 3.0 #jal ra, bf16_mul #mv a3, a0 # 0xC0C0 #li a0, 0x7F80 # a = +Inf #li a1, 0x40A0 # b = 5.0 #jal ra, bf16_mul #mv a4, a0 # 0x7F80 done: j done bf16_mul: srli t0, a0, 15 # sign_a andi t0, t0, 1 srli t1, a1, 15 # sign_b andi t1, t1, 1 xor t6, t0, t1 # result_sign -> t6 srli a2, a0, 7 # exp_a andi a2, a2, 0xFF srli a3, a1, 7 # exp_b andi a3, a3, 0xFF andi a4, a0, 0x7F # mant_a andi a5, a1, 0x7F # mant_b li t2, 0xFF bne a2, t2, 1f bnez a4, .ret_a or t3, a3, a5 beqz t3, .ret_qnan j .ret_inf 1: bne a3, t2, 2f bnez a5, .ret_b or t3, a2, a4 beqz t3, .ret_qnan j .ret_inf 2: or t3, a2, a4 beqz t3, .ret_zero or t3, a3, a5 beqz t3, .ret_zero li a6, 0 bnez a2, 3f 0: andi t3, a4, 0x80 bnez t3, 4f slli a4, a4, 1 addi a6, a6, -1 j 0b 4: li a2, 1 3: ori a4, a4, 0x80 bnez a3, 5f 6: andi t3, a5, 0x80 bnez t3, 7f slli a5, a5, 1 addi a6, a6, -1 j 6b 7: li a3, 1 5: ori a5, a5, 0x80 add a7, a2, a3 addi t3, x0, 127 sub a7, a7, t3 add a7, a7, a6 mv t0, a4 mv t1, a5 li t4, 0 li t5, 8 mul_loop: andi t2, t1, 1 beqz t2, 8f add t4, t4, t0 8: slli t0, t0, 1 srli t1, t1, 1 addi t5, t5, -1 bnez t5, mul_loop li t0, 0x8000 and t1, t4, t0 beqz t1, 9f srli t4, t4, 8 addi a7, a7, 1 j 10f 9: srli t4, t4, 7 10: andi t4, t4, 0x7F li t0, 0xFF bge a7, t0, 
.ret_inf blez a7, 11f j pack 11: li t0, -6 blt a7, t0, .ret_zero li t1, 1 sub t1, t1, a7 srl t4, t4, t1 li a7, 0 pack: slli t0, t6, 15 slli t1, a7, 7 or a0, t0, t1 or a0, a0, t4 ret .ret_a: ret .ret_b: mv a0, a1 ret .ret_inf: li a0, 0x7F80 slli t0, t6, 15 or a0, a0, t0 ret .ret_qnan: li a0, 0x7FC0 ret .ret_zero: slli a0, t6, 15 ret ``` ### static inline bf16_t bf16_div(bf16_t a, bf16_t b) ``` .text .globl bf16_div .globl main main: li a0, 0x40C0 # a = 6.0 (bf16) li a1, 0x4040 # b = 3.0 (bf16) jal ra, bf16_div # a0 <- result mv a2, a0 # 0x4000 done: j done bf16_div: srli t0, a0, 15 # sign_a andi t0, t0, 1 srli t1, a1, 15 # sign_b andi t1, t1, 1 xor t6, t0, t1 # result_sign -> t6 srli a2, a0, 7 # exp_a andi a2, a2, 0xFF srli a3, a1, 7 # exp_b andi a3, a3, 0xFF andi a4, a0, 0x7F # mant_a andi a5, a1, 0x7F # mant_b li t2, 0xFF bne a3, t2, 1f bnez a5, .ret_b # b = Inf beq a2, t2, .ret_bf16_nan j .ret_signed_zero 1: or t0, a3, a5 bnez t0, 2f or t1, a2, a4 beqz t1, .ret_bf16_nan j .ret_signed_inf 2: bne a2, t2, 3f bnez a4, .ret_a j .ret_signed_inf 3: or t0, a2, a4 beqz t0, .ret_signed_zero bnez a2, 4f j 5f 4: ori a4, a4, 0x80 5: bnez a3, 6f j 7f 6: ori a5, a5, 0x80 7: slli t4, a4, 15 # dividend add t5, x0, a5 # divisor li t2, 0 # quotient li t3, 0 # i add a7, a2, x0 sub a7, a7, a3 addi a7, a7, 127 bnez a2, 8f addi a7, a7, -1 # !exp_a → -- 8: bnez a3, 9f addi a7, a7, 1 # !exp_b → ++ 9: div_loop: slli t2, t2, 1 # quotient <<= 1 li t0, 15 sub t0, t0, t3 # k = 15 - i sll t1, t5, t0 # tmp = divisor << k sltu t0, t4, t1 # dividend < tmp ? 
bne t0, x0, 10f sub t4, t4, t1 # dividend -= tmp ori t2, t2, 1 # quotient |= 1 10: addi t3, t3, 1 li t0, 16 blt t3, t0, div_loop li t0, 0x8000 and t1, t2, t0 bnez t1, 11f norm_loop: and t1, t2, t0 bnez t1, 12f li t1, 1 ble a7, t1, 12f slli t2, t2, 1 addi a7, a7, -1 j norm_loop 11: srli t2, t2, 8 j 13f 12: srli t2, t2, 8 13: andi t2, t2, 0x7F li t0, 0xFF bge a7, t0, .ret_signed_inf blez a7, .ret_signed_zero slli t0, t6, 15 # sign<<15 slli t1, a7, 7 # (exp&0xFF)<<7 or a0, t0, t1 or a0, a0, t2 # | mant(7 bits) ret .ret_a: ret .ret_b: mv a0, a1 ret .ret_bf16_nan: li a0, 0x7FC0 ret .ret_signed_inf: li a0, 0x7F80 slli t0, t6, 15 or a0, a0, t0 ret .ret_signed_zero: slli a0, t6, 15 ret ```