Assignment1: RISC-V Assembly and Instruction Pipeline

# Assignment1: RISC-V Assembly and Instruction Pipeline :::danger Choose one problem (A, B, or C) from [Quiz1](https://hackmd.io/@sysprog/arch2024-quiz1-sol), translate it from C code to a complete RISC-V assembly program, and include the relevant test data. ::: ## problem C First, perform conversion processing on fabsf. ```c= static inline float fabsf(float x) { uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value x = *(float *)&i; // Write the modified bits back into the float return x; } ``` ``` fabsf: li t1, 0x7FFFFFFF # mask and a0, a0, t1 # mutiply with float a0 ret ``` Next, handle __builtin_clz. ```c= static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } ``` ``` my_clz: li t0, 0 # count li t1, 31 # i=31 li t2, 1 # for shifting loop: slli t3, t2, t1 # t3 = 1 << i and t3, a0, t3 # t3 = x & (1 << i) bnez t3, end # if t3!=0, break addi t0, t0, 1 # count ++ addi t1, t1, -1 # i-- bgez t1, loop # loop end: mv a0, t0 ret ``` Finally, handle fp16_to_fp32. ```c= static inline uint32_t fp16_to_fp32(uint16_t h) { const uint32_t w = (uint32_t) h << 16; const uint32_t sign = w & UINT32_C(0x80000000); const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); uint32_t renorm_shift = my_clz(nonsign); renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); } ``` expect code ```c= #include <stdio.h> #include <stdint.h> static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } static inline uint32_t fp16_to_fp32(uint16_t h) { const uint32_t w = (uint32_t) h << 16; const uint32_t sign = w & UINT32_C(0x80000000); const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); uint32_t renorm_shift = my_clz(nonsign); renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); } int main() { unsigned short a = 0xAB00; printf("0x%X", fp16_to_fp32(a)); return 0; } ``` RISC-V ``` .data num: .word 0xAB00 .text main: lw a0,num #load num jal ra,fp16_to_fp32 #call fp16_to_fp32 mv a1,a0 #save result li a0, 34 # print format "0x" li a7, 11 ecall mv a0,a1 #restore result li a7,34 #print hex ecall li a7,10 #exit ecall fp16_to_fp32: mv t0,a0 # h = num slli t0,t0,16 # w = h<<16 li t1,0x80000000 and t1,t0,t1 # sign = w&0x80000000 li t2,0x7FFFFFFF and t2,t0,t2 # nosign = w&07FFFFFFF # Save registers before calling my_clz addi sp, sp, -8 sw ra, 4(sp) sw t1, 0(sp) # save sign mv a0, t2 # prepare nonsign as argument for my_clz jal ra,my_clz #call my_clz # Restore sign lw t1, 0(sp) lw ra, 4(sp) addi sp, sp, 8 mv t3, a0 # renorm_shift = clz result addi t3, t3, -5 # renorm_shift -= 5 bgt t3, zero, skip_reset li t3, 0 # if renorm_shift <= 0, set to 0 skip_reset: li t4, 0x04000000 add t4, t2, t4 # nonsign + 0x04000000 srli t4, t4, 8 # >> 8 li t5, 0x7F800000 and t4, t4, t5 # inf_nan_mask addi t5, t2, -1 # nonsign - 1 srli t5, t5, 31 # zero_mask = (nonsign-1)>>31 not t5, t5 # ~zero_mask sll t2, t2, t3 # nonsign << renorm_shift srli t2, t2, 3 # >> 3 li t6, 0x70 sub t3, t6, t3 # 0x70 - renorm_shift slli t3, t3, 23 # << 23 add t2, t2, t3 # add shifted values or t2, t2, t4 # | inf_nan_mask and t2, t2, t5 # & ~zero_mask or a0, t1, t2 # | sign ret my_clz: li t0,0 #count = 0 li t1,31 #i = 31 li t2,1 #until 1 loop: sll t6, t5, t4 # 1 << i and t6, t2, t6 # x & (1 << i) bnez t6, end addi t3, t3, 1 # count ++ addi t4, t4, -1 # i-- bgez t4, loop end: mv a0, t0 # return count ret ``` ## Leetcode: Problem 1. TwoSum I initially wanted to choose Problem A, but since it seems that defining and declaring fp16_t is a bit troublesome, I decided to start with a simpler exercise to become more familiar with RISC-V. Therefore, I chose **[Leetcode's Problem 1, TwoSum](https://leetcode.com/problems/two-sum/description/)**, as it allows me to practice using "for" and "if. ### problem #### Two Sum ##### easy Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. You may assume that each input would have exactly one solution, and you may not use the same element twice. You can return the answer in any order. Example 1: Input: nums = [2,7,11,15], target = 9 Output: [0,1] Explanation: Because nums[0] + nums[1] == 9, we return [0, 1]. Example 2: Input: nums = [3,2,4], target = 6 Output: [1,2] Example 3: Input: nums = [3,3], target = 6 Output: [0,1] ## C ```c #include <stdio.h> #include <stdlib.h> int* twoSum(int* nums, int numsSize, int target, int* returnSize) { static int result[2]; *returnSize = 2; //simple search between two array for (int i = 0; i < numsSize; i++) { for (int j = i + 1; j < numsSize; j++) { if (nums[i] + nums[j] == target) { result[0] = i; result[1] = j; } } } return result; } int main() { int* nums[3]; nums[0] = (int[]){2, 7, 11, 15}; nums[1] = (int[]){3, 2, 4}; nums[2] = (int[]){3, 3}; int numsSize[] = {4,3,2}; int target[] = {9,6,6}; for (int i =0;i<3;i++) { int returnSize; int* result = twoSum(nums[i], numsSize[i], target[i], &returnSize); printf("[%d,%d]\n", result[0], result[1]); } return 0; } ``` automate the code ```c #include <stdio.h> #include <stdlib.h> #include <stdbool.h> typedef struct { int* nums; int numsSize; int target; int expected[2]; char* description; } TestCase; int* twoSum(int* nums, int numsSize, int target, int* returnSize) { static int result[2]; *returnSize = 2; for (int i = 0; i < numsSize; i++) { for (int j = i + 1; j < numsSize; j++) { if (nums[i] + nums[j] == target) { result[0] = i; result[1] = j; } } } return result; } bool checkResult(int* result, int* expected) { return (result[0] == expected[0] && result[1] == expected[1]) || (result[0] == expected[1] && result[1] == expected[0]); } bool runTest(TestCase* test, int testNumber) { int returnSize; int* result = twoSum(test->nums, test->numsSize, test->target, &returnSize); bool passed = checkResult(result, test->expected); printf("Test %d: %s\n", testNumber + 1, test->description); printf("Input array: ["); for (int i = 0; i < test->numsSize; i++) { printf("%d%s", test->nums[i], i < test->numsSize - 1 ? ", " : ""); } printf("]\n"); printf("Target: %d\n", test->target); printf("Expected: [%d, %d]\n", test->expected[0], test->expected[1]); printf("Got: [%d, %d]\n", result[0], result[1]); printf("Status: %s\n\n", passed ? "PASSED ✓" : "FAILED ✗"); return passed; } int main() { TestCase tests[] = { { .nums = (int[]){2, 7, 11, 15}, .numsSize = 4, .target = 9, .expected = {0, 1}, .description = "Basic case with positive numbers" }, { .nums = (int[]){3, 2, 4}, .numsSize = 3, .target = 6, .expected = {1, 2}, .description = "Numbers not in sorted order" }, { .nums = (int[]){3, 3}, .numsSize = 2, .target = 6, .expected = {0, 1}, .description = "Duplicate numbers" }, { .nums = (int[]){-1, -2, -3, -4, -5}, .numsSize = 5, .target = -8, .expected = {2, 4}, .description = "Negative numbers" }, { .nums = (int[]){0, 4, 3, 0}, .numsSize = 4, .target = 0, .expected = {0, 3}, .description = "Zero sum with duplicate zeros" } }; int totalTests = sizeof(tests) / sizeof(tests[0]); int passedTests = 0; printf("Running %d tests for twoSum function...\n\n", totalTests); for (int i = 0; i < totalTests; i++) { if (runTest(&tests[i], i)) { passedTests++; } } printf("Test Summary:\n"); printf("Total tests: %d\n", totalTests); printf("Passed: %d\n", passedTests); printf("Failed: %d\n", totalTests - passedTests); printf("Success rate: %.1f%%\n", (float)passedTests / totalTests * 100); return 0; } ``` :::danger You should automate the testing procedures as much as possible, meaning there's no need to manually check each program output. Instead, implement internal validation to handle the result checking. ::: ## RISC-V ``` .data A1: .word 2,7,11,15 #numerbers1 S1: .word 4 #size of A1 T1: .word 9 #target1 A2: .word 3,2,4 #numerbers2 S2: .word 3 #size of A2 T2: .word 6 #target2 A3: .word 3,3 #numerbers3 S3: .word 2 #size of A3 T3: .word 6 #target3 str1: .string "[" str2: .string "," str3: .string "]" .text .global main main: la a5,A2 #load numbers1 in a5 lw a1,S2 #load size in a1 lw a2,T2 #load target in a2 li a3,0 #int i L1: #loop 1 bge a3,a1,end #if i>size end mv a4,a3 #int j=i addi a4,a4,1 #j = i+1 L2: #loop 2 bge a4,a1,inc #if j>size end slli t0,a3,2 #load answer[i] add t0, t0, a5 lw t1,0(t0) slli t0,a4,2 #load answer[j] add t0, t0, a5 lw t2,0(t0) add t2,t2,t1 beq t2,a2,R #compare if it equal and jump addi a4,a4,1 #j++ j L2 inc: addi a3,a3,1 #i++ j L1 R: #if numbers[i]==numbers[j] sw a3,0(sp) #answer sw a4,4(sp) j end print: lw t0,0(sp) #load answer[0] lw t1,4(sp) #load answer[1] la a0,str1 #load str1 li a7,4 #print string ecall #print mv a0,t0 #loat ansser[0] li a7,1 #print int ecall la a0,str2 li a7,4 ecall mv a0,t1 li a7,1 ecall la a0,str3 li a7,4 ecall ret end: jal ra,print #print answer li a7,10 #end ecall ``` ```diff= main: -addi sp,sp,-8 #result -sw zero,0(sp) #r[0]=0 -sw zero,4(sp) #r[1]=0 +a4,1 #j=1 L1: +slli t0,a3,2 #load answer[i] +add t0,t0,a5 +lw t1,0(t0) #t1 = number[i] +sub t2,a2,t1 #t2 = target - number[i] -mv a4,a3 #int j=i -addi a4,a4,1 #j = j+1 L2: +slli t0,a4,2 #load answer[j] +lw t3,0(t0) #t3 = numbers[j] +beq t3,t2,R -lw t1,0(t0) -lw t2,0(t0) -add t2,t2,t1 inc: +addi a4,a3,1 #j = i + 1 ``` after improvement ``` .data A1: .word 2,7,11,15 #numerbers1 S1: .word 4 #size of A1 T1: .word 9 #target1 A2: .word 3,2,4 #numerbers2 S2: .word 3 #size of A2 T2: .word 6 #target2 A3: .word 3,3 #numerbers3 S3: .word 2 #size of A3 T3: .word 6 #target3 str1: .string "[" str2: .string "," str3: .string "]" .text .global main main: la a5,A2 #load numbers1 in a5 lw a1,S2 #load size in a1 lw a2,T2 #load target in a2 li a3, 0 # i = 0 li a4, 1 # j = 1 L1: #loop 1 bge a3,a1,end #if i>size end slli t0,a3,2 #load answer[i] add t0,t0,a5 lw t1,0(t0) #t1 = numbers[i] sub t2,a2,t1 #t2 = target - number[i] L2: #loop 2 bge a4,a1,inc #if j>size end slli t0,a4,2 #load answer[j] add t0, t0, a5 lw t3,0(t0) #t3 = numbers[j] beq t3,t2,R addi a4, a4, 1 #j++ j L2 inc: addi a3,a3,1 #i++ addi a4,a3,1 #j=i+1 j L1 R: #if numbers[i]==numbers[j] sw a3,0(sp) #answer sw a4,4(sp) j end print: lw t0,0(sp) #load answer[0] lw t1,4(sp) #load answer[1] la a0,str1 #load str1 li a7,4 #print string ecall #print mv a0,t0 #loat ansser[0] li a7,1 #print int ecall la a0,str2 li a7,4 ecall mv a0,t1 li a7,1 ecall la a0,str3 li a7,4 ecall ret end: jal ra,print #print answer li a7,10 #end ecall ``` fewer instructions and automate code ``` .data test1: .word 2, 7, 11, 15 size1: .word 4 target1: .word 9 expect1: .word 0, 1 test2: .word 3, 2, 4 size2: .word 3 target2: .word 6 expect2: .word 1, 2 test3: .word 3, 3 size3: .word 2 target3: .word 6 expect3: .word 0, 1 # Output strings test_str: .string "\nTest " pass_str: .string " Passed\n" fail_str: .string " Failed\n" brack_open: .string "[" brack_close: .string "]" comma: .string ", " .text .global main # a0: array address # a1: size # a2: target # Returns result in s0 (first index) and s1 (second index) twoSum: li t0, 0 # i = 0 outer: beq t0, a1, done # if i == size, done slli t3, t0, 2 # i * 4 add t3, a0, t3 lw t4, 0(t3) # nums[i] addi t1, t0, 1 # j = i + 1 inner: beq t1, a1, next_i # if j == size, next i slli t3, t1, 2 # j * 4 add t3, a0, t3 lw t5, 0(t3) # nums[j] add t6, t4, t5 # nums[i] + nums[j] bne t6, a2, next_j # if sum != target, next j mv s0, t0 # store i mv s1, t1 # store j ret next_j: addi t1, t1, 1 # j++ j inner next_i: addi t0, t0, 1 # i++ j outer done: ret print_result: la a0, brack_open li a7, 4 ecall mv a0, s0 li a7, 1 ecall la a0, comma li a7, 4 ecall mv a0, s1 li a7, 1 ecall la a0, brack_close li a7, 4 ecall ret run_test: # Save return address addi sp, sp, -4 sw ra, 0(sp) # Run twoSum jal ra, twoSum # Print result jal ra, print_result # Compare with expected lw t0, 0(a3) # expected[0] lw t1, 4(a3) # expected[1] beq s0, t0, check_second beq s0, t1, check_first j test_fail check_second: beq s1, t1, test_pass j test_fail check_first: beq s1, t0, test_pass j test_fail test_pass: la a0, pass_str j print_result_status test_fail: la a0, fail_str print_result_status: li a7, 4 ecall # Restore return address and return lw ra, 0(sp) addi sp, sp, 4 ret main: la a0, test_str li a7, 4 ecall li a0, 1 li a7, 1 ecall la a0, test1 lw a1, size1 lw a2, target1 la a3, expect1 jal ra, run_test la a0, test_str li a7, 4 ecall li a0, 2 li a7, 1 ecall la a0, test2 lw a1, size2 lw a2, target2 la a3, expect2 jal ra, run_test la a0, test_str li a7, 4 ecall li a0, 3 li a7, 1 ecall la a0, test3 lw a1, size3 lw a2, target3 la a3, expect3 jal ra, run_test li a7, 10 ecall ``` :::danger Use fewer instructions. ::: :::danger Do not use screenshots for plain text content, as this is inaccessible to visually impaired users. ::: ## Bare C code performance with -O0 option ![o0](https://hackmd.io/_uploads/H1M8DwCCA.jpg) with -O2 option ![o2](https://hackmd.io/_uploads/r1GIvDR0R.png) wish RISC-V ![riscv](https://hackmd.io/_uploads/HkVgdDCC0.png) wish RISC-V(with first time improvement) ![improve](https://hackmd.io/_uploads/SyZDAGVykl.png)