# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < [wu81177](https://github.com/wu81177/computer_architecture_lab1) >
## Problem B in [Quiz1](https://hackmd.io/@sysprog/arch2024-quiz1-sol)
What I first learned from this code is the difference between BF16 and FP16. BF16 has an 8-bit exponent, just like FP32, which makes conversion easy.
### `bf16_to_fp32`
```c
static inline float bf16_to_fp32(bf16_t h)
{
union {
float f;
uint32_t i;
} u = {.i = (uint32_t)h.bits << 16};
return u.f;
}
```
`union` is different from `struct` in that it only occupies memory equal to its largest member, and all members share the same memory space.
This part of the assembly code is relatively simple, just shift left by 16 bits and then store it in a0 to return.
```c
bf16_to_fp32:
slli t0, a0, 16
mv a0, t0
ret
```
### `fp32_to_bf16`
```c
static inline bf16_t fp32_to_bf16(float s)
{
bf16_t h;
union {
float f;
uint32_t i;
} u = {.f = s};
if ((u.i & 0x7fffffff) > 0x7f800000) { /* NaN */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10;
return h;
}
```
The important part is the line: `h.bits = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10;` .
This line of code implements the IEEE 754 "round to nearest, ties to even" (also known as bankers' rounding) method. If the discarded bits are exactly halfway (i.e., the 17th bit is 1), it ensures the result is rounded towards the nearest even value, reducing rounding bias in repeated calculations.
Assembly code:
```c
fp32_to_bf16:
mv t0, a0
li t1, 0x7fffffff
and t2, t0, t1
li t1, 0x7f800000
bltu t1, t2, nan_case
li t1, 0x7fff
srli t2, t0, 16
andi t3, t2, 1
add t1, t1, t3
add t0, t0, t1
srli t0, t0, 16
mv a0, t0
ret
nan_case:
srli t0, t0, 16
ori t0, t0, 64
mv a0, t0
ret
```
### Testing
This code aims to test the correctness of conversions between 32-bit floating-point (FP32) and 16-bit BF16 formats. It begins by defining three floating-point values for testing: **2.718**, **positive infinity**, and **NaN**, along with their corresponding bit patterns and expected results in both BF16 and recovered FP32 formats. In a loop, the program sequentially converts each floating-point value to BF16 and compares the conversion result with the expected BF16 bit pattern. If they match, the success is recorded, and the result is printed. Then, the BF16 value is converted back to 32-bit floating-point, and the recovered bit pattern is compared with the expected FP32 bit pattern, again recording and printing whether the comparison is successful. The 6 testing results would be stored in a array (1 for pass, 0 for fail).
```c!
int main() {
float test_values[3] = {2.718f, INFINITY, 0.0f / 0.0f}; //0x402DF3B6, 0x7F800000, 0xFFC00000
uint16_t expected_bf16[3] = {0x402E, 0x7f80, 0xFFC0};
uint32_t expected_recovered[3] = {0x402E0000, 0x7F800000, 0xFFC00000};
bool result[6] = {0};
for (int i = 0; i < 3; i++) {
float original = test_values[i];
bf16_t bf16_value = fp32_to_bf16(original);
if(bf16_value.bits == expected_bf16[i]) result[2*i] = 1;
float recovered = bf16_to_fp32(bf16_value);
if(*(uint32_t*)&recovered == expected_recovered[i]) result[2*i+1] =1;
}
for (int i = 0; i < 6; i++) printf("%d",result[i]);
return 0;
}
```
Assembly code:
```asm=
.data
test_values:
.word 0x402DF3B6, 0x7F800000, 0xFFC00000
expected_bf16:
.half 0x402E, 0x7F80, 0xFFC0
expected_recovered:
.word 0x402E0000, 0x7F800000, 0xFFC00000
result:
.byte 0, 0, 0, 0, 0, 0
.text
main:
la s0, test_values
la s1, expected_bf16
la s2, expected_recovered
la s3, result
li s4, 3 # i bound
li s5, 0 # i init
loop:
lw a0, 0(s0)
call fp32_to_bf16
lhu t1, 0(s1)
bne a0, t1, nmatch_bf16
li t1, 1
sb t1, 0(s3)
nmatch_bf16:
call bf16_to_fp32
lw t1, 0(s2)
addi s3, s3, 1
bne a0, t1, next_iteration
li t1, 1
sb t1, 0(s3)
next_iteration:
addi s0, s0, 4
addi s1, s1, 2
addi s2, s2, 4
addi s3, s3, 1
addi s5, s5, 1
blt s5, s4, loop
call print_result
li a7, 93
ecall
print_result:
la t0, result
addi t1, t0, 6
print_loop:
lb a0, 0(t0)
addi a0, a0, 48 # 0 or 1 in ASCII
li a7, 11
ecall
addi t0, t0, 1
bltu t0, t1, print_loop
ret
```
:::danger
Use fewer instructions.
:::
When rewriting this assembly code, I got stuck on the check for converting `0.0f / 0.0f` to BF16. After some observing, I realized that in line 26, the instruction `lh t1, 0(s1)` was sign-extending the upper 16 bits instead of zero-extending them. Changing it to `lhu` solved the problem.
## [Find Numbers with Even Numbers of Digits](https://leetcode.com/problems/find-numbers-with-even-number-of-digits/description/)
**Description** : Given an array `nums` of integers, return how many of them contain an **even number** of digits.
The most intuitive solution is to repeatedly divide each element of the array by 10 until it becomes less than 10 to determine its digit count. This can also be understood as taking the floor of the base-10 logarithm, encapsulated in a `log_10` function. Since RV32I lacks division instructions, I refer to the `divmod_10` function from [Linux Kernel Internals 2024 quiz3-2](https://hackmd.io/@sysprog/linux2024-quiz3#%E6%B8%AC%E9%A9%97-2) to obtain the quotient for division by 10.
Additionally, in the core `findNumbers` function, replacing `digits % 2` with `digits & 1` avoids division.
C code:
```c!
#include <stdint.h>
#include <stdio.h>
uint32_t div_10(uint32_t in)
{
uint32_t x = (in | 1) - (in >> 2); /* div = in/10 ==> div = 0.75*in/8 */
uint32_t q = (x >> 4) + x;
x = q;
q = (q >> 8) + x;
q = (q >> 8) + x;
q = (q >> 8) + x;
q = (q >> 8) + x;
uint32_t div = (q >> 3);
return div;
}
int log_10(int num) {
int log = 0;
while (num >= 10) {
uint32_t div = div_10(num);
num = div;
log++;
}
return log;
}
int findNumbers(int* nums, int numsSize) {
int count = 0;
for (int i = 0; i < numsSize; i++) {
int digits_m1 = log_10(nums[i]);
if (digits_m1 & 1) count++;
}
return count;
}
int main() {
int nums1[] = {12, 345, 2, 6, 7896};
int nums2[] = {555, 901, 482, 1771};
int nums3[] = {1234, 56789, 0, 88, 2020, 1, 12, 425, 56436, 235457, 5415, 454, 2, 0};
int expect_num_result[] = {2, 1, 6};
int num_results[3];
int numsSize1 = 5;
num_results[0] = findNumbers(nums1, numsSize1);
int numsSize2 = 4;
num_results[1] = findNumbers(nums2, numsSize2);
int numsSize3 = 14;
num_results[2] = findNumbers(nums3, numsSize3);
for (int i = 0; i < 3; i++) {
if (num_results[i] != expect_num_result[i]) {
printf("Test #%d Fail.\n",i+1);
} else {
printf("Test #%d Pass.\n",i+1);
}
}
return 0;
}
```
:::danger
Don't use `spoiler` specifiers.
:::
Assembly Code:
After the calculation, the results will be stored in the `num_results` array. They will be compared with `expect_num_result`, and the console will print whether each of the three tests passed or failed.
```asm=
.data
msg_pass: .asciz ": Test Pass.\n"
msg_fail: .asciz ": Test Fail.\n"
nums1: .word 12, 345, 2, 6, 7896
nums2: .word 555, 901, 482, 1771
nums3: .word 1234, 56789, 0, 88, 2020, 1, 12, 425, 56436, 235457, 5415, 454, 2, 0
expect_num_result: .word 2, 1, 6
num_results: .word 0, 0, 0
.text
main:
# Test 1
la a0, nums1
li a1, 5
addi sp, sp, -4
sw ra, 0(sp)
call findNumbers
lw ra, 0(sp)
addi sp, sp, 4
la t0, num_results
sw a0, 0(t0)
# Test 2
la a0, nums2
li a1, 4
addi sp, sp, -4
sw ra, 0(sp)
call findNumbers
lw ra, 0(sp)
addi sp, sp, 4
la t0, num_results
sw a0, 4(t0)
# Test 3
la a0, nums3
li a1, 14
addi sp, sp, -4
sw ra, 0(sp)
call findNumbers
lw ra, 0(sp)
addi sp, sp, 4
la t0, num_results
sw a0, 8(t0)
# Print test results
li s0, 0 # i = 0
li s1, 3 # i boundary
la s4, expect_num_result
la s5, num_results
print_loop:
bge s0, s1, end_loop
lw t0, 0(s5) # result data
lw t1, 0(s4) # expected data
beq t0, t1, pass
# fail
mv a0, s0
addi a0, a0, 1
li a7, 1
ecall
la a0, msg_fail
li a7, 4
ecall
j next
pass:
mv a0, s0
addi a0, a0, 1
li a7, 1
ecall
la a0, msg_pass
li a7, 4
ecall
j next
next:
addi s0, s0, 1
addi s4, s4, 4
addi s5, s5, 4
j print_loop
end_loop:
li a7, 93
ecall
div_10:
# x = (in | 1) - (in >> 2)
ori t0, a0, 1
srli t1, a0, 2
sub t0, t0, t1
# q = (x >> 4) + x
srli t1, t0, 4
add t0, t0, t1
mv t1, t0
# q = (q >> 8) + x (4 times)
li t3, 4
loop_q:
srli t2, t1, 8
add t1, t0, t2
addi t3, t3, -1
bnez t3, loop_q
# *div = q >> 3
srli a0, t1, 3
ret
log_10:
li s1, 10
li s4, 0 #log
while_loop:
bge a0, s1, div_step
mv a0, s4
ret
div_step:
addi sp, sp, -4
sw ra, 0(sp)
call div_10
lw ra, 0(sp)
addi sp, sp, 4
addi s4, s4, 1
j while_loop
findNumbers:
li s2, 0 # count = 0
li s0, 0 # i
mv s3, a0 # array address
find_loop:
bge s0, a1, end_fn
lw a0, 0(s3)
# digits = log_10(nums[i]) + 1
addi sp, sp, -4
sw ra, 0(sp)
call log_10
lw ra, 0(sp)
addi sp, sp, 4
# if (digits_m1 & 1) count++
andi t4, a0, 1 # t4 = digits_m1 & 1
beq t4, x0, skip_add
addi s2, s2, 1
skip_add:
addi s3, s3, 4 # address + 4
addi s0, s0, 1
j find_loop
end_fn:
mv a0, s2
ret
```
### Test result

Check the memory content of `num_result` can also confirm that it correctly holds the values 2, 1, and 6.