Assignment1: RISC-V Assembly and Instruction Pipeline 2025

# Assignment1: RISC-V Assembly and Instruction Pipeline contributed by < [`winnie0710`](https://github.com/winnie0710/RISC-V) > ## Problem B : Encode uint32_t to uf8 ### Abstract #### goal * Translate from C code into full RISC-V assembly programs * Develop a simplified but informative use case to frame your assignment * Include at least three test data cases ### clz Use binary search to find the position of the first 1. The number of operations can be reduced from a maximum of 32 to a fixed number of 5 iterations. ```c static inline unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return n - x; } ``` The corresponding RISC-V assembly are showing below. ```s .data test_x: .word 0x00001234 str: .asciz "CLZ Result: " # add "\0" on the end of string .text main: la t1, test_x lw a0, 0(t1) jal ra, clz mv a1, a0 # Put the result to a1 first to avoid being overwritten jal ra, print # 結束程式 li a7, 10 # Load System Service No. 10 (Exit) ecall # Execute the system call to exit the program clz: # assume that input parameter x is stored at a0 addi s0, x0, 32 # n=32 addi s1, x0, 16 # c=16 mv s2, a0 addi t0, x0, 0 # y=0 clz_loop: srl t0, s2, s1 # y = x >> c bnez t0, clz_y_exist srli s1, s1, 1 bnez s1, clz_loop j clz_finish clz_y_exist: sub s0, s0, s1 mv s2, t0 srli s1, s1, 1 bnez s1, clz_loop clz_finish: sub a0, s0, s2 ret # return to main print: la a0, str # load string li a7, 4 # print string ecall mv a0, a1 li a7, 1 # print integer ecall ret # go back to main ``` ``` la a0, msg_test # "test data: " li a7, 4 # a7=4, print string ecall mv a0, s2 # print i li a7, 1 # a7=1, print integer ecall ``` ### uf8_decode offset ```c /* Decode uf8 to uint32_t */ uint32_t uf8_decode(uf8 fl) { uint32_t mantissa = fl & 0x0f; // last 4 bit uint8_t exponent = fl >> 4; // first 4 bit uint32_t offset = (0x7FFF >> (15 - exponent)) << 4; return (mantissa << exponent) + offset; } ``` The corresponding RISC-V assembly are showing below. ```s .data .text main: li a0, 0x2B jal ra, decode li a7, 10 ecall decode: andi t0, a0, 0x0F # m = fl & 0x0f srli t1, a0, 4 # e = fl >> 4 li t2, 1 sll t2, t2, t1 addi t2, t2, -1 slli t2, t2, 4 # offset = ((1<<e)-1) << 4 sll t0, t0, t1 add a0, t0, t2 # value = (m << e) + offset ret ``` ### uf8_encode $$\ value = overflow + mantissa * 2^{exponent}$$ ```c uf8 uf8_encode(uint32_t value) { /* Use CLZ for fast exponent calculation */ if (value < 16) return value; /* Find appropriate exponent using CLZ hint */ int lz = clz(value); int msb = 31 - lz; /* Start from a good initial guess */ uint8_t exponent = 0; uint32_t overflow = 0; if (msb >= 5) { /* Estimate exponent - the formula is empirical */ exponent = msb - 4; if (exponent > 15) exponent = 15; /* Calculate overflow for estimated exponent */ for (uint8_t e = 0; e < exponent; e++) overflow = (overflow << 1) + 16; /* Adjust if estimate was off */ while (exponent > 0 && value < overflow) { overflow = (overflow - 16) >> 1; exponent--; } } /* Find exact exponent */ while (exponent < 15) { uint32_t next_overflow = (overflow << 1) + 16; if (value < next_overflow) break; overflow = next_overflow; exponent++; } uint8_t mantissa = (value - overflow) >> exponent; return (exponent << 4) | mantissa; } ``` The corresponding RISC-V assembly are showing below. ```s .data .text main: li a0, 0x00235656 jal ra, start_encode li a7, 10 ecall start_encode: mv s0, a0 # store a0 value addi s1, x0, 31 # exponent addi s2, x0, 0 # overflow li t0, 0 li t1, 0 # msb li t2, 1 bge a0, t0, encode ret # return encode: jal ra, clz # a0 = lz sub t1, s1, a0 # msb = 31 - clz li t0, 5 blt t1, t0, exact_exponent # msb < 5, go to exact_exponent addi s1, t1, -4 # exponent = msb-4 li t0, 16 li t1, 0 # set calculate_overflow counting number blt s1, t0, calculate_overflow # exponent < 16, go to overflow addi s1, x0, 15 # exponent = 15 calculate_overflow: slli s2, s2, 1 addi s2, s2, 16 addi t1, t1, 1 blt t1, s1, calculate_overflow # e < exponent adjust_overflow: blt s1, t2, exact_exponent # exponent < 1, jump bge s0, s2, exact_exponent # addi s2, s2, -16 srli s2, s2, 1 addi s1, s1, -1 j adjust_overflow exact_exponent: addi t2, x0, 15 bge s1, t2, finish # exponent >= 15, out of while slli t1, s2, 1 # set next_exponent addi t1, t1, 16 blt s0, t1, finish mv s2, t1 addi s1, s1, 1 j exact_exponent clz: # assume that input parameter x is stored at a0 addi sp, sp, -12 sw s0, 0(sp) sw s1, 4(sp) sw s2, 8(sp) addi s0, x0, 32 # n=32 addi s1, x0, 16 # c=16 mv s2, a0 addi t0, x0, 0 # y=0 clz_loop: srl t0, s2, s1 # y = x >> c bnez t0, clz_y_exist srli s1, s1, 1 bnez s1, clz_loop j clz_finish clz_y_exist: sub s0, s0, s1 mv s2, t0 srli s1, s1, 1 bnez s1, clz_loop clz_finish: sub a0, s0, s2 lw s0, 0(sp) lw s1, 4(sp) lw s2, 8(sp) addi sp, sp, 12 ret # return to encode finish: sub s0, s0, s2 # value ->mantissa srl s0, s0, s1 slli s1, s1, 4 or a0, s1, s0 ret # return to main ``` ### Test :::spoiler fully RISC-V assemble ```code .data msg_pass: .asciz "All tests passed." msg_fail: .asciz "Failed." msg_mismatch_a: .asciz " mismatch: fl= " msg_mismatch_b: .asciz " value= " msg_mismatch_c: .asciz " fl2= " msg_nl: .asciz "\n" msg_exponent: .asciz "Exponent: " msg_1round: .asciz "passed = " .text main: jal ra, test # start test beqz a0, print_fail # a0=0, fail la a0, msg_pass li a7, 4 ecall li a7, 10 ecall print_fail: la a0, msg_fail li a7, 4 ecall li a7, 10 ecall decode: andi t0, a0, 0x0F # m = fl & 0x0f srli t1, a0, 4 # e = fl >> 4 li t2, 1 sll t2, t2, t1 addi t2, t2, -1 slli t2, t2, 4 # offset = ((1<<e)-1) << 4 sll t0, t0, t1 add a0, t0, t2 # value = (m << e) + offset ret encode: addi sp, sp, -16 sw ra, 0(sp) sw s0, 4(sp) sw s1, 8(sp) sw s2, 12(sp) mv s0, a0 # store a0 value addi s1, x0, 0 # exponent addi s2, x0, 0 # overflow li t1, 0 # msb li t2, 1 jal ra, clz # a0 = lz li t0, 31 sub t1, t0, a0 # msb = 31 - clz li t0, 5 blt t1, t0, exact_exponent # msb < 5, go to exact_exponent addi s1, t1, -4 # exponent = msb-4 li t0, 16 li t1, 0 # set calculate_overflow counting number blt s1, t0, calculate_overflow # exponent < 16, go to overflow addi s1, x0, 15 # exponent = 15 calculate_overflow: slli s2, s2, 1 addi s2, s2, 16 addi t1, t1, 1 blt t1, s1, calculate_overflow # e < exponent adjust_overflow: blt s1, t2, exact_exponent # exponent < 1, jump bge s0, s2, exact_exponent # addi s2, s2, -16 srli s2, s2, 1 addi s1, s1, -1 j adjust_overflow exact_exponent: addi t2, x0, 15 bge s1, t2, encode_finish # exponent >= 15, out of while slli t1, s2, 1 # set next_exponent addi t1, t1, 16 blt s0, t1, encode_finish mv s2, t1 addi s1, s1, 1 j exact_exponent clz: # assume that input parameter x is stored at a0 addi sp, sp, -16 sw s0, 0(sp) sw s1, 4(sp) sw s2, 8(sp) sw ra, 12(sp) addi s0, x0, 32 # n=32 addi s1, x0, 16 # c=16 mv s2, a0 addi t0, x0, 0 # y=0 clz_loop: srl t0, s2, s1 # y = x >> c bnez t0, clz_y_exist srli s1, s1, 1 bnez s1, clz_loop j clz_finish clz_y_exist: sub s0, s0, s1 mv s2, t0 srli s1, s1, 1 bnez s1, clz_loop clz_finish: sub a0, s0, s2 lw s0, 0(sp) lw s1, 4(sp) lw s2, 8(sp) lw ra, 12(sp) addi sp, sp, 16 ret # return to encode encode_finish: la a0, msg_exponent li a7, 4 ecall mv a0, s1 # exponent li a7, 1 ecall sub s0, s0, s2 # value ->mantissa srl s0, s0, s1 slli s1, s1, 4 or a0, s1, s0 lw ra, 0(sp) lw s0, 4(sp) lw s1, 8(sp) lw s2, 12(sp) addi sp, sp, 16 ret # return to main test: addi sp, sp, -20 sw ra, 0(sp) sw s0, 4(sp) # previous_value sw s1, 8(sp) # passed sw s2, 12(sp) # i sw s3, 16(sp) # value addi s0, x0, -1 # previous_value = -1 addi s1, x0, 1 # passed = 1 addi s2, x0, 0 # i = 0 test_loop: addi t1, x0, 256 bge s2, t1, test_done mv a0, s2 jal ra, decode mv s3, a0 # s3 = value jal ra, encode mv t1, a0 # t1 = fl2 la a0, msg_mismatch_a li a7, 4 ecall mv a0, s2 # fl li a7, 1 ecall la a0, msg_mismatch_b li a7, 4 ecall mv a0, s3 # value li a7, 1 ecall la a0, msg_mismatch_c li a7, 4 ecall mv a0, t1 # fl2 li a7, 1 ecall la a0, msg_nl li a7, 4 ecall bne t1, s2, set_false # fail slt t0, s0, s3 # ok bnez t0, test_plus set_false: addi s1, x0, 0 # passed = false test_plus: mv s0, s3 # previous_value = value addi s2, s2, 1 # i++ j test_loop test_done: mv a0, s1 lw ra, 0(sp) lw s0, 4(sp) # previous_value lw s1, 8(sp) # passed lw s2, 12(sp) # i lw s3, 16(sp) # value addi sp, sp, 20 ret ``` ::: ### Analysis kkk ## Problem C : bfloat16 format ### Abstract #### goal Manually simulate floating-point arithmetic using bitwise operations directly on a 16-bit unsigned register. ```c typedef struct { uint16_t bits; } bf16_t; ``` ### bf16_add ```c static inline bf16_t bf16_add(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; if (exp_a == 0xFF) { if (mant_a) return a; if (exp_b == 0xFF) return (mant_b || sign_a == sign_b) ? b : BF16_NAN(); return a; } if (exp_b == 0xFF) return b; if (!exp_a && !mant_a) return b; if (!exp_b && !mant_b) return a; if (exp_a) mant_a |= 0x80; if (exp_b) mant_b |= 0x80; int16_t exp_diff = exp_a - exp_b; uint16_t result_sign; int16_t result_exp; uint32_t result_mant; if (exp_diff > 0) { result_exp = exp_a; if (exp_diff > 8) return a; mant_b >>= exp_diff; } else if (exp_diff < 0) { result_exp = exp_b; if (exp_diff < -8) return b; mant_a >>= -exp_diff; } else { result_exp = exp_a; } if (sign_a == sign_b) { result_sign = sign_a; result_mant = (uint32_t) mant_a + mant_b; if (result_mant & 0x100) { result_mant >>= 1; if (++result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } } else { if (mant_a >= mant_b) { result_sign = sign_a; result_mant = mant_a - mant_b; } else { result_sign = sign_b; result_mant = mant_b - mant_a; } if (!result_mant) return BF16_ZERO(); while (!(result_mant & 0x80)) { result_mant <<= 1; if (--result_exp <= 0) return BF16_ZERO(); } } return (bf16_t) { .bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (result_mant & 0x7F), }; } ``` The corresponding RISC-V assembly are showing below. ```s main: # a0 -> bf16_t a srli s0, a0, 15 # sign a, 15 andi s0, s0, 1 srli s1, a0, 7 # exp a, 7-14 andi s1, s1, 0xFF andi s2, a1, 0x7F # mantissa a, 0-6 #a1 -> bf16_t b srli s3, a1, 15 # sign b, 15 andi s3, s3, 1 srli s4, a1, 7 # exp b, 7-14 andi s4, s4, 0xFF andi s5, a1, 0x7F # mantissa b, 0-6 ``` ### bf16_mul ```c ``` The corresponding RISC-V assembly are showing below. ```s ``` ## Leetcode137. Single Number II Description: Given an integer array nums where every element appears three times except for one, which appears exactly once. Find the single element and return it. You must implement a solution with a linear runtime complexity and use only constant extra space. Constraints: * 1 <= nums.length <= 3 * 104 * -2^31 <= nums[i] <= 2^31 - 1 * Each element in nums appears exactly three times except for one element which appears once. ### Soltion #### C First Version ```c static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } int singleNumber(int* nums, int numsSize) { int result = 0; int min_leading_zeros = 32; for (int i = 0; i < numsSize; i++) { if (nums[i] != 0) { int leading_zeros = my_clz((uint32_t)nums[i]); if (leading_zeros < min_leading_zeros) { min_leading_zeros = leading_zeros; } } } for (int i = 0; i < 32 - min_leading_zeros; i++) { int count = 0; for (int j = 0; j < numsSize; j++) { if (nums[j] & (1U << i)) { count++; } } if (count % 3 != 0) { result |= (1U << i); } } return result; } ``` #### Assembly code ```s .data # Test Case 1: nums = [2, 2, 3, 2] nums1: .word 2, 2, 3, 2 nums1_size: .word 4 result_str: .asciz "Result: " newline_str: .asciz "\n" int_buffer: .zero 12 # Allocate and zero-initialize 12 bytes .text .globl main main: # Test Case 1 la a0, nums1 # Load address of nums1 array lw a1, nums1_size # Load size of nums1 array jal ra, singleNumber # Call singleNumber(nums1, size) mv a2, a0 # Move result to a2 for printing la a0, result_str # Load address of result string jal ra, print_result # Print result # Exit program li a0, 0 # Exit code 0 li a7, 93 # ECALL for exit ecall # int singleNumber(int* nums, int numsSize) singleNumber: addi sp, sp, -36 # Allocate stack space sw ra, 32(sp) # Save return address sw s0, 28(sp) # Save s0 sw s1, 24(sp) # Save s1 sw s2, 20(sp) # Save s2 sw s3, 16(sp) # Save s3 sw s4, 12(sp) # Save s4 sw s5, 8(sp) # Save s5 sw s6, 4(sp) # Save s6 sw s7, 0(sp) # Save s7 mv s0, a0 # s0 = nums (int* nums) mv s1, a1 # s1 = numsSize (int numsSize) li s7, 0 # s7 = result = 0 # Initialize min_leading_zeros = 32 li s5, 32 # s5 = min_leading_zeros = 32 li s2, 0 # s2 = i = 0 find_min_leading_zeros_loop: bge s2, s1, end_min_leading_zeros_loop # Load nums[i] slli t0, s2, 2 # t0 = i * 4 add t0, s0, t0 # t0 = &nums[i] lw t1, 0(t0) # t1 = nums[i] bnez t1, process_non_zero addi s2, s2, 1 # i++ jal zero, find_min_leading_zeros_loop process_non_zero: # Call my_clz((uint32_t)nums[i]) mv a0, t1 # a0 = nums[i] jal ra, my_clz # leading_zeros in a0 mv t2, a0 # t2 = leading_zeros blt t2, s5, update_min_leading_zeros jal zero, skip_update_min_leading_zeros update_min_leading_zeros: mv s5, t2 # min_leading_zeros = leading_zeros skip_update_min_leading_zeros: addi s2, s2, 1 # i++ jal zero, find_min_leading_zeros_loop end_min_leading_zeros_loop: # Compute max_bit_index = 32 - min_leading_zeros li t0, 32 sub s6, t0, s5 # s6 = 32 - min_leading_zeros # Initialize s2 = 0 (i) li s2, 0 # s2 = i = 0 singleNumber_outer_loop: bge s2, s6, singleNumber_end_outer_loop # if i >= (32 - min_leading_zeros), exit loop # Initialize count = 0 li s3, 0 # s3 = count = 0 li s4, 0 # s4 = j = 0 singleNumber_inner_loop: bge s4, s1, singleNumber_end_inner_loop # if j >= numsSize, exit inner loop # Load nums[j] slli t1, s4, 2 # t1 = j * 4 add t1, s0, t1 # t1 = &nums[j] lw t2, 0(t1) # t2 = nums[j] # Check if nums[j] & (1U << i) li t3, 1 sll t3, t3, s2 # t3 = 1U << i and t4, t2, t3 # t4 = nums[j] & (1U << i) beqz t4, singleNumber_skip_increment # if zero, skip increment # count = (count + 1) % 3 addi s3, s3, 1 # count++ li t5, 3 blt s3, t5, singleNumber_skip_reset # if count < 3, skip reset li s3, 0 # count = 0 singleNumber_skip_reset: # Continue to next iteration singleNumber_skip_increment: addi s4, s4, 1 # j++ jal zero, singleNumber_inner_loop singleNumber_end_inner_loop: # if count != 0 beqz s3, singleNumber_skip_bit_set # if count == 0, skip setting bit # result |= (1U << i) li t6, 1 sll t6, t6, s2 # t6 = 1U << i or s7, s7, t6 # s7 |= t6 singleNumber_skip_bit_set: addi s2, s2, 1 # i++ jal zero, singleNumber_outer_loop singleNumber_end_outer_loop: mv a0, s7 # Move result to a0 # Restore registers and stack lw s7, 0(sp) lw s6, 4(sp) lw s5, 8(sp) lw s4, 12(sp) lw s3, 16(sp) lw s2, 20(sp) lw s1, 24(sp) lw s0, 28(sp) lw ra, 32(sp) addi sp, sp, 36 ret # int my_clz(uint32_t x) my_clz: addi sp, sp, -8 # Allocate stack space sw ra, 4(sp) # Save return address sw s0, 0(sp) # Save s0 mv s0, a0 # s0 = x # if x == 0, return 32 beqz s0, my_clz_return_32 # if x < 0 (i.e., (int32_t)x < 0), return 0 bltz s0, my_clz_return_0 # count = 0 li t0, 0 # t0 = count li t1, 31 # t1 = i my_clz_loop: blt t1, zero, my_clz_end_loop # if i < 0, end loop li t2, 1 sll t2, t2, t1 # t2 = 1U << i and t3, s0, t2 # t3 = x & (1U << i) bnez t3, my_clz_end_loop addi t0, t0, 1 # count++ addi t1, t1, -1 # i-- jal zero, my_clz_loop my_clz_end_loop: mv a0, t0 # Return count jal zero, my_clz_exit my_clz_return_32: li a0, 32 jal zero, my_clz_exit my_clz_return_0: li a0, 0 my_clz_exit: # Restore registers and return lw s0, 0(sp) lw ra, 4(sp) addi sp, sp, 8 ret # Simplified print_result function for Ripes simulator print_result: addi sp, sp, -8 # Allocate stack space sw ra, 4(sp) # Save return address sw a0, 0(sp) # Save a0 (address of result_str) # Print "Result: " mv a0, a0 # a0 already contains address of result_str li a7, 4 # ECALL for print string ecall # Print integer in a2 mv a0, a2 # Move result to a0 jal ra, print_int # Call print_int # Print newline la a0, newline_str li a7, 4 ecall # Restore registers and stack lw a0, 0(sp) # Restore a0 lw ra, 4(sp) # Restore ra addi sp, sp, 8 ret # Corrected print_int function print_int: # Convert integer to string and print # Handle negative numbers addi sp, sp, -36 # Adjust stack size for additional registers sw ra, 32(sp) sw t0, 28(sp) sw t1, 24(sp) sw t2, 20(sp) sw t3, 16(sp) sw t4, 12(sp) sw t5, 8(sp) sw a4, 4(sp) # Save a4 sw a5, 0(sp) # Save a5 mv t0, a0 # t0 = number la t1, int_buffer # t1 = address of int_buffer # Clear the buffer li t2, 12 # t2 = buffer size li t3, 0 # t3 = zero mv a5, t1 # a5 = buffer pointer clear_buffer_loop: beqz t2, clear_buffer_done sb t3, 0(a5) addi a5, a5, 1 addi t2, t2, -1 jal zero, clear_buffer_loop clear_buffer_done: # Reset t1 to end of buffer la t1, int_buffer # t1 = address of int_buffer addi t1, t1, 11 # t1 points to int_buffer + 11 (one before end) li t2, 48 # t2 = ASCII code for '0' li t3, 0 # t3 = sign flag li a4, 10 # a4 = 10 # Check if number is zero beq t0, zero, print_int_zero # Check if number is negative blt t0, zero, print_int_negative print_int_loop: # t4 = t0 / 10 (quotient) # t5 = t0 % 10 (remainder) # Initialize t4 = 0 (quotient), t5 = t0 (remainder) mv t5, t0 # t5 = t0 (remainder) li t4, 0 # t4 = 0 (quotient) print_int_divide_loop: blt t5, a4, print_int_divide_end sub t5, t5, a4 # t5 -= 10 addi t4, t4, 1 # t4 += 1 jal zero, print_int_divide_loop print_int_divide_end: # Now t4 is quotient, t5 is remainder # Convert remainder to character add t6, t5, t2 # t6 = t5 + '0' sb t6, 0(t1) # Store character at t1 addi t1, t1, -1 # t1 -= 1 # Update t0 to quotient mv t0, t4 # t0 = t4 # If t0 != 0, continue loop bnez t0, print_int_loop # After loop, check for sign beqz t3, print_int_print # If not negative, proceed to print # Add '-' sign li t6, 45 # '-' sb t6, 0(t1) addi t1, t1, -1 print_int_print: addi t1, t1, 1 # Adjust t1 to point to the start of string # Print the number string mv a0, t1 # a0 = pointer to start of string li a7, 4 ecall # Restore registers and stack lw a5, 0(sp) # Restore a5 lw a4, 4(sp) # Restore a4 lw t5, 8(sp) lw t4, 12(sp) lw t3, 16(sp) lw t2, 20(sp) lw t1, 24(sp) lw t0, 28(sp) lw ra, 32(sp) addi sp, sp, 36 ret print_int_negative: li t3, 1 # Set sign flag neg t0, t0 # t0 = -t0 jal zero, print_int_loop print_int_zero: # Handle zero li t6, 48 # '0' sb t6, 0(t1) addi t1, t1, -1 jal zero, print_int_print ``` ### Test gitgub ### Analysis cycle ![image](https://hackmd.io/_uploads/B18D_59agl.png) ## 5-stage pipelined processor