# Assignment1: RISC-V Assembly and Instruction Pipeline
:::danger
Choose one problem (A, B, or C) from [Quiz1](https://hackmd.io/@sysprog/arch2024-quiz1-sol), translate it from C code to a complete RISC-V assembly program, and include the relevant test data.
:::
## problem C
First, perform conversion processing on fabsf.
```c=
static inline float fabsf(float x) {
uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer
i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value
x = *(float *)&i; // Write the modified bits back into the float
return x;
}
```
```
fabsf:
li t1, 0x7FFFFFFF # mask
and a0, a0, t1 # mutiply with float a0
ret
```
Next, handle __builtin_clz.
```c=
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
```
```
my_clz:
li t0, 0 # count
li t1, 31 # i=31
li t2, 1 # for shifting
loop:
slli t3, t2, t1 # t3 = 1 << i
and t3, a0, t3 # t3 = x & (1 << i)
bnez t3, end # if t3!=0, break
addi t0, t0, 1 # count ++
addi t1, t1, -1 # i--
bgez t1, loop # loop
end:
mv a0, t0
ret
```
Finally, handle fp16_to_fp32.
```c=
static inline uint32_t fp16_to_fp32(uint16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
uint32_t renorm_shift = my_clz(nonsign);
renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) &
INT32_C(0x7F800000);
const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
return sign | ((((nonsign << renorm_shift >> 3) +
((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
```
expect code
```c=
#include <stdio.h>
#include <stdint.h>
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
static inline uint32_t fp16_to_fp32(uint16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
uint32_t renorm_shift = my_clz(nonsign);
renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) &
INT32_C(0x7F800000);
const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
return sign | ((((nonsign << renorm_shift >> 3) +
((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
int main()
{
unsigned short a = 0xAB00;
printf("0x%X", fp16_to_fp32(a));
return 0;
}
```
RISC-V
```
.data
num: .word 0xAB00
.text
main:
lw a0,num #load num
jal ra,fp16_to_fp32 #call fp16_to_fp32
mv a1,a0 #save result
li a0, 34 # print format "0x"
li a7, 11
ecall
mv a0,a1 #restore result
li a7,34 #print hex
ecall
li a7,10 #exit
ecall
fp16_to_fp32:
mv t0,a0 # h = num
slli t0,t0,16 # w = h<<16
li t1,0x80000000
and t1,t0,t1 # sign = w&0x80000000
li t2,0x7FFFFFFF
and t2,t0,t2 # nosign = w&07FFFFFFF
# Save registers before calling my_clz
addi sp, sp, -8
sw ra, 4(sp)
sw t1, 0(sp) # save sign
mv a0, t2 # prepare nonsign as argument for my_clz
jal ra,my_clz #call my_clz
# Restore sign
lw t1, 0(sp)
lw ra, 4(sp)
addi sp, sp, 8
mv t3, a0 # renorm_shift = clz result
addi t3, t3, -5 # renorm_shift -= 5
bgt t3, zero, skip_reset
li t3, 0 # if renorm_shift <= 0, set to 0
skip_reset:
li t4, 0x04000000
add t4, t2, t4 # nonsign + 0x04000000
srli t4, t4, 8 # >> 8
li t5, 0x7F800000
and t4, t4, t5 # inf_nan_mask
addi t5, t2, -1 # nonsign - 1
srli t5, t5, 31 # zero_mask = (nonsign-1)>>31
not t5, t5 # ~zero_mask
sll t2, t2, t3 # nonsign << renorm_shift
srli t2, t2, 3 # >> 3
li t6, 0x70
sub t3, t6, t3 # 0x70 - renorm_shift
slli t3, t3, 23 # << 23
add t2, t2, t3 # add shifted values
or t2, t2, t4 # | inf_nan_mask
and t2, t2, t5 # & ~zero_mask
or a0, t1, t2 # | sign
ret
my_clz:
li t0,0 #count = 0
li t1,31 #i = 31
li t2,1 #until 1
loop:
sll t6, t5, t4 # 1 << i
and t6, t2, t6 # x & (1 << i)
bnez t6, end
addi t3, t3, 1 # count ++
addi t4, t4, -1 # i--
bgez t4, loop
end:
mv a0, t0 # return count
ret
```
## Leetcode: Problem 1. TwoSum
I initially wanted to choose Problem A, but since it seems that defining and declaring fp16_t is a bit troublesome, I decided to start with a simpler exercise to become more familiar with RISC-V. Therefore, I chose **[Leetcode's Problem 1, TwoSum](https://leetcode.com/problems/two-sum/description/)**, as it allows me to practice using "for" and "if.
### problem
#### Two Sum
##### easy
Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.
You may assume that each input would have exactly one solution, and you may not use the same element twice.
You can return the answer in any order.
Example 1:
Input: nums = [2,7,11,15], target = 9
Output: [0,1]
Explanation: Because nums[0] + nums[1] == 9, we return [0, 1].
Example 2:
Input: nums = [3,2,4], target = 6
Output: [1,2]
Example 3:
Input: nums = [3,3], target = 6
Output: [0,1]
## C
```c
#include <stdio.h>
#include <stdlib.h>
int* twoSum(int* nums, int numsSize, int target, int* returnSize) {
static int result[2];
*returnSize = 2;
//simple search between two array
for (int i = 0; i < numsSize; i++) {
for (int j = i + 1; j < numsSize; j++) {
if (nums[i] + nums[j] == target) {
result[0] = i;
result[1] = j;
}
}
}
return result;
}
int main() {
int* nums[3];
nums[0] = (int[]){2, 7, 11, 15};
nums[1] = (int[]){3, 2, 4};
nums[2] = (int[]){3, 3};
int numsSize[] = {4,3,2};
int target[] = {9,6,6};
for (int i =0;i<3;i++)
{
int returnSize;
int* result = twoSum(nums[i], numsSize[i], target[i], &returnSize);
printf("[%d,%d]\n", result[0], result[1]);
}
return 0;
}
```
automate the code
```c
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
typedef struct {
int* nums;
int numsSize;
int target;
int expected[2];
char* description;
} TestCase;
int* twoSum(int* nums, int numsSize, int target, int* returnSize) {
static int result[2];
*returnSize = 2;
for (int i = 0; i < numsSize; i++) {
for (int j = i + 1; j < numsSize; j++) {
if (nums[i] + nums[j] == target) {
result[0] = i;
result[1] = j;
}
}
}
return result;
}
bool checkResult(int* result, int* expected) {
return (result[0] == expected[0] && result[1] == expected[1]) ||
(result[0] == expected[1] && result[1] == expected[0]);
}
bool runTest(TestCase* test, int testNumber) {
int returnSize;
int* result = twoSum(test->nums, test->numsSize, test->target, &returnSize);
bool passed = checkResult(result, test->expected);
printf("Test %d: %s\n", testNumber + 1, test->description);
printf("Input array: [");
for (int i = 0; i < test->numsSize; i++) {
printf("%d%s", test->nums[i], i < test->numsSize - 1 ? ", " : "");
}
printf("]\n");
printf("Target: %d\n", test->target);
printf("Expected: [%d, %d]\n", test->expected[0], test->expected[1]);
printf("Got: [%d, %d]\n", result[0], result[1]);
printf("Status: %s\n\n", passed ? "PASSED ✓" : "FAILED ✗");
return passed;
}
int main() {
TestCase tests[] = {
{
.nums = (int[]){2, 7, 11, 15},
.numsSize = 4,
.target = 9,
.expected = {0, 1},
.description = "Basic case with positive numbers"
},
{
.nums = (int[]){3, 2, 4},
.numsSize = 3,
.target = 6,
.expected = {1, 2},
.description = "Numbers not in sorted order"
},
{
.nums = (int[]){3, 3},
.numsSize = 2,
.target = 6,
.expected = {0, 1},
.description = "Duplicate numbers"
},
{
.nums = (int[]){-1, -2, -3, -4, -5},
.numsSize = 5,
.target = -8,
.expected = {2, 4},
.description = "Negative numbers"
},
{
.nums = (int[]){0, 4, 3, 0},
.numsSize = 4,
.target = 0,
.expected = {0, 3},
.description = "Zero sum with duplicate zeros"
}
};
int totalTests = sizeof(tests) / sizeof(tests[0]);
int passedTests = 0;
printf("Running %d tests for twoSum function...\n\n", totalTests);
for (int i = 0; i < totalTests; i++) {
if (runTest(&tests[i], i)) {
passedTests++;
}
}
printf("Test Summary:\n");
printf("Total tests: %d\n", totalTests);
printf("Passed: %d\n", passedTests);
printf("Failed: %d\n", totalTests - passedTests);
printf("Success rate: %.1f%%\n", (float)passedTests / totalTests * 100);
return 0;
}
```
:::danger
You should automate the testing procedures as much as possible, meaning there's no need to manually check each program output. Instead, implement internal validation to handle the result checking.
:::
## RISC-V
```
.data
A1: .word 2,7,11,15 #numerbers1
S1: .word 4 #size of A1
T1: .word 9 #target1
A2: .word 3,2,4 #numerbers2
S2: .word 3 #size of A2
T2: .word 6 #target2
A3: .word 3,3 #numerbers3
S3: .word 2 #size of A3
T3: .word 6 #target3
str1: .string "["
str2: .string ","
str3: .string "]"
.text
.global main
main:
la a5,A2 #load numbers1 in a5
lw a1,S2 #load size in a1
lw a2,T2 #load target in a2
li a3,0 #int i
L1:
#loop 1
bge a3,a1,end #if i>size end
mv a4,a3 #int j=i
addi a4,a4,1 #j = i+1
L2:
#loop 2
bge a4,a1,inc #if j>size end
slli t0,a3,2 #load answer[i]
add t0, t0, a5
lw t1,0(t0)
slli t0,a4,2 #load answer[j]
add t0, t0, a5
lw t2,0(t0)
add t2,t2,t1
beq t2,a2,R #compare if it equal and jump
addi a4,a4,1 #j++
j L2
inc:
addi a3,a3,1 #i++
j L1
R:
#if numbers[i]==numbers[j]
sw a3,0(sp) #answer
sw a4,4(sp)
j end
print:
lw t0,0(sp) #load answer[0]
lw t1,4(sp) #load answer[1]
la a0,str1 #load str1
li a7,4 #print string
ecall #print
mv a0,t0 #loat ansser[0]
li a7,1 #print int
ecall
la a0,str2
li a7,4
ecall
mv a0,t1
li a7,1
ecall
la a0,str3
li a7,4
ecall
ret
end:
jal ra,print #print answer
li a7,10 #end
ecall
```
```diff=
main:
-addi sp,sp,-8 #result
-sw zero,0(sp) #r[0]=0
-sw zero,4(sp) #r[1]=0
+a4,1 #j=1
L1:
+slli t0,a3,2 #load answer[i]
+add t0,t0,a5
+lw t1,0(t0) #t1 = number[i]
+sub t2,a2,t1 #t2 = target - number[i]
-mv a4,a3 #int j=i
-addi a4,a4,1 #j = j+1
L2:
+slli t0,a4,2 #load answer[j]
+lw t3,0(t0) #t3 = numbers[j]
+beq t3,t2,R
-lw t1,0(t0)
-lw t2,0(t0)
-add t2,t2,t1
inc:
+addi a4,a3,1 #j = i + 1
```
after improvement
```
.data
A1: .word 2,7,11,15 #numerbers1
S1: .word 4 #size of A1
T1: .word 9 #target1
A2: .word 3,2,4 #numerbers2
S2: .word 3 #size of A2
T2: .word 6 #target2
A3: .word 3,3 #numerbers3
S3: .word 2 #size of A3
T3: .word 6 #target3
str1: .string "["
str2: .string ","
str3: .string "]"
.text
.global main
main:
la a5,A2 #load numbers1 in a5
lw a1,S2 #load size in a1
lw a2,T2 #load target in a2
li a3, 0 # i = 0
li a4, 1 # j = 1
L1:
#loop 1
bge a3,a1,end #if i>size end
slli t0,a3,2 #load answer[i]
add t0,t0,a5
lw t1,0(t0) #t1 = numbers[i]
sub t2,a2,t1 #t2 = target - number[i]
L2:
#loop 2
bge a4,a1,inc #if j>size end
slli t0,a4,2 #load answer[j]
add t0, t0, a5
lw t3,0(t0) #t3 = numbers[j]
beq t3,t2,R
addi a4, a4, 1 #j++
j L2
inc:
addi a3,a3,1 #i++
addi a4,a3,1 #j=i+1
j L1
R:
#if numbers[i]==numbers[j]
sw a3,0(sp) #answer
sw a4,4(sp)
j end
print:
lw t0,0(sp) #load answer[0]
lw t1,4(sp) #load answer[1]
la a0,str1 #load str1
li a7,4 #print string
ecall #print
mv a0,t0 #loat ansser[0]
li a7,1 #print int
ecall
la a0,str2
li a7,4
ecall
mv a0,t1
li a7,1
ecall
la a0,str3
li a7,4
ecall
ret
end:
jal ra,print #print answer
li a7,10 #end
ecall
```
fewer instructions and automate code
```
.data
test1: .word 2, 7, 11, 15
size1: .word 4
target1: .word 9
expect1: .word 0, 1
test2: .word 3, 2, 4
size2: .word 3
target2: .word 6
expect2: .word 1, 2
test3: .word 3, 3
size3: .word 2
target3: .word 6
expect3: .word 0, 1
# Output strings
test_str: .string "\nTest "
pass_str: .string " Passed\n"
fail_str: .string " Failed\n"
brack_open: .string "["
brack_close: .string "]"
comma: .string ", "
.text
.global main
# a0: array address
# a1: size
# a2: target
# Returns result in s0 (first index) and s1 (second index)
twoSum:
li t0, 0 # i = 0
outer:
beq t0, a1, done # if i == size, done
slli t3, t0, 2 # i * 4
add t3, a0, t3
lw t4, 0(t3) # nums[i]
addi t1, t0, 1 # j = i + 1
inner:
beq t1, a1, next_i # if j == size, next i
slli t3, t1, 2 # j * 4
add t3, a0, t3
lw t5, 0(t3) # nums[j]
add t6, t4, t5 # nums[i] + nums[j]
bne t6, a2, next_j # if sum != target, next j
mv s0, t0 # store i
mv s1, t1 # store j
ret
next_j:
addi t1, t1, 1 # j++
j inner
next_i:
addi t0, t0, 1 # i++
j outer
done:
ret
print_result:
la a0, brack_open
li a7, 4
ecall
mv a0, s0
li a7, 1
ecall
la a0, comma
li a7, 4
ecall
mv a0, s1
li a7, 1
ecall
la a0, brack_close
li a7, 4
ecall
ret
run_test:
# Save return address
addi sp, sp, -4
sw ra, 0(sp)
# Run twoSum
jal ra, twoSum
# Print result
jal ra, print_result
# Compare with expected
lw t0, 0(a3) # expected[0]
lw t1, 4(a3) # expected[1]
beq s0, t0, check_second
beq s0, t1, check_first
j test_fail
check_second:
beq s1, t1, test_pass
j test_fail
check_first:
beq s1, t0, test_pass
j test_fail
test_pass:
la a0, pass_str
j print_result_status
test_fail:
la a0, fail_str
print_result_status:
li a7, 4
ecall
# Restore return address and return
lw ra, 0(sp)
addi sp, sp, 4
ret
main:
la a0, test_str
li a7, 4
ecall
li a0, 1
li a7, 1
ecall
la a0, test1
lw a1, size1
lw a2, target1
la a3, expect1
jal ra, run_test
la a0, test_str
li a7, 4
ecall
li a0, 2
li a7, 1
ecall
la a0, test2
lw a1, size2
lw a2, target2
la a3, expect2
jal ra, run_test
la a0, test_str
li a7, 4
ecall
li a0, 3
li a7, 1
ecall
la a0, test3
lw a1, size3
lw a2, target3
la a3, expect3
jal ra, run_test
li a7, 10
ecall
```
:::danger
Use fewer instructions.
:::
:::danger
Do not use screenshots for plain text content, as this is inaccessible to visually impaired users.
:::
## Bare C code performance
with -O0 option

with -O2 option

wish RISC-V

wish RISC-V(with first time improvement)
