# Assignment2: Complete Applications
contributed by < [JimmyCh1025](https://github.com/JimmyCh1025) >
[TOC]
###### tags: `RISC-V` `RV32emu` `Computer Architure 2025`
## Problem [Leetcode 3370. Smallest Number With All Set Bits](https://leetcode.com/problems/smallest-number-with-all-set-bits/description/)
This code has already finished unrolling before I finished HW1.
### C Program
```c=
#include <stdio.h>
#include <stdint.h>
static inline unsigned clz(uint32_t x)
{
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) {
n -= c;
x = y;
}
c >>= 1;
} while (c);
return n - x;
}
int smallestNumber(int n) {
int bit_len = (1 << (32-clz(n)))-1;
return bit_len;
}
int main()
{
int input[] = {1, 509, 1000}, output;
int ans[] = {1, 511, 1023};
for (int i = 0 ; i < 3 ; ++i)
{
output = smallestNumber(input[i]);
printf("output = %d, answer = %d\n", output, ans[i]);
if (output == ans[i])
printf("True\n");
else
printf("False\n");
}
return 0;
}
```
### Original assembly(run in Ripes)
:::spoiler More detailed information
```assembly=
.data
input:
.word 1, 509, 1000
ans:
.word 1, 511, 1023
output_str:
.string "output = "
answer_str:
.string ", answer = "
endline_str:
.string "\n"
true_str:
.string "True\n"
false_str:
.string "False\n"
.text
.global main
main:
# load input array
la s0, input
# load answer array
la s1, ans
# i = 0
li t0, 0
# for loop boundary 3
li t1, 3
main_for:
# for (int i = 0 ; i < 3 ; ++i)
bge t0, t1, main_exit
# load input[i] to a0
slli t2, t0, 2
add t3, s0, t2
lw a0, 0(t3)
# call smallestNumber
addi sp, sp, -16
sw ra, 12(sp)
sw t2, 8(sp)
sw t1, 4(sp)
sw t0, 0(sp)
jal ra, smallestNumber
# output = smallestNumber(input[i])
add s2, a0, x0
lw t0, 0(sp)
lw t1, 4(sp)
lw t2, 8(sp)
lw ra, 12(sp)
addi sp, sp, 16
j print_result
print_result:
# print "output = "
la a0, output_str
li a7, 4
ecall
# print output
addi a0, s2, 0
li a7, 1
ecall
# ", answer = "
la a0, answer_str
li a7, 4
ecall
# load ans[i] to t3
# print answer
add t2, s1, t2
lw a0, 0(t2)
add t2, a0, x0
li a7, 1
ecall
# print "\n"
la a0, endline_str
li a7, 4
ecall
# if (output[i] == ans[i])
beq s2, t2, print_true
# else
j print_false
print_true:
# print "True\n"
la a0, true_str
li a7, 4
ecall
# ++i
addi t0, t0, 1
j main_for
print_false:
# print "False\n"
la a0, false_str
li a7, 4
ecall
# ++i
addi t0, t0, 1
j main_for
smallestNumber:
addi sp, sp, -12
sw ra, 8(sp)
sw t0, 4(sp)
sw t1, 0(sp)
# call clz
addi sp, sp, -8 # store ra, a0(n)
sw ra, 4(sp)
sw a0, 0(sp)
jal ra, clz
li t0, 32 # bit_len = 32
li t1, 1
sub t0, t0, a0 # bit_len = 32 - clz(n)
lw a0, 0(sp)
lw ra, 4(sp)
addi sp, sp, 8
sll t1, t1, t0
addi a0, t1, -1
lw t1, 0(sp)
lw t0, 4(sp)
lw ra, 8(sp)
addi sp, sp, 12
jalr x0, x1, 0
clz:
li t0, 32 # t0 = n
li t1, 16 # t1 = c
j clz_whileLoop
clz_whileLoop:
srl t2, a0, t1 # y = t2, y = x >> c
beq t2, x0, shift_right_1bit # if y == 0, jump to shift_right_1bit
sub t0, t0, t1 # n = n - c
addi a0, t2, 0 # x = y
j shift_right_1bit
shift_right_1bit:
srli t1, t1, 1 # c = c >> 1
bne t1, x0, clz_whileLoop # if c != 0, jump to clz_whileLoop
sub a0, t0, a0 # x = n - x
jalr x0, x1, 0
main_exit:
li a7, 10
ecall
```
:::
### Modify assembly(run in rv32emu)
Since the previous code cannot run in rv32emu, I modified my code and ran it in rv32emu.
:::spoiler More detailed information
```assembly=
.set STDOUT, 1
.set SYSREAD, 63
.set SYSWRITE, 64
.set SYSEXIT, 93
.data
input:
.word 1, 509, 1000
ans:
.word 1, 511, 1023
output_str:
.string "output = "
answer_str:
.string ", answer = "
endline_str:
.string "\n"
true_str:
.string "True\n"
false_str:
.string "False\n"
.lcomm buffer, 12
.text
.global main
main:
# load input array
la s0, input
# load answer array
la s1, ans
main_for:
# loop unrolling
# load input[0] to a0
addi t3, s0, 0
lw a0, 0(t3)
# call smallestNumber
addi sp, sp, -4
sw ra, 0(sp)
jal ra, smallestNumber
# output = smallestNumber(input[0])
add s2, a0, x0
# load input[1] to a0
addi t3, s0, 4
lw a0, 0(t3)
# call smallestNumber
jal ra, smallestNumber
# output = smallestNumber(input[1])
add s3, a0, x0
# load input[2] to a0
addi t3, s0, 8
lw a0, 0(t3)
# call smallestNumber
jal ra, smallestNumber
# output = smallestNumber(input[1])
add s4, a0, x0
lw ra, 0(sp)
addi sp, sp, 4
j print_result_0
print_result_0:
# print "output = "
li a0, STDOUT
la a1, output_str
li a2, 9
li a7, SYSWRITE
ecall
# print output
addi t0, s2, 0
la a1, buffer
call num_to_str
li a0, STDOUT
la a1, buffer
li a2, 12
# add a2, t4, x0
li a7, SYSWRITE
ecall
# ", answer = "
li a0, STDOUT
la a1, answer_str
li a2, 11
li a7, SYSWRITE
ecall
# load ans[0] to t3
# print answer
addi t2, s1, 0
lw a0, 0(t2)
addi t0, a0, 0
la a1, buffer
call num_to_str
# reorder li and add
li a0, STDOUT
la a1, buffer
li a2, 12
li a7, SYSWRITE
addi t2, s1, 0
lw t2, 0(t2)
ecall
# print "\n"
li a0, STDOUT
la a1, endline_str
li a2, 1
li a7, SYSWRITE
ecall
# if (output[i] == ans[i])
beq s2, t2, print_true_0
# else
j print_false_0
print_true_0:
# print "True\n"
li a0, STDOUT
la a1, true_str
li a2, 5
li a7, SYSWRITE
ecall
j print_result_1
print_false_0:
# print "False\n"
li a0, STDOUT
la a1, false_str
li a2, 6
li a7, SYSWRITE
ecall
j print_result_1
print_result_1:
# print "output = "
li a0, STDOUT
la a1, output_str
li a2, 9
li a7, SYSWRITE
ecall
# print output
addi t0, s3, 0
la a1, buffer
call num_to_str
li a0, STDOUT
la a1, buffer
li a2, 12
li a7, SYSWRITE
ecall
# ", answer = "
li a0, STDOUT
la a1, answer_str
li a2, 11
li a7, SYSWRITE
ecall
# load ans[0] to t3
# print answer
addi t2, s1, 4
lw a0, 0(t2)
addi t0, a0, 0
la a1, buffer
call num_to_str
# reorder li and add
li a0, STDOUT
la a1, buffer
li a2, 12
li a7, SYSWRITE
addi t2, s1, 4
lw t2, 0(t2)
ecall
# print "\n"
li a0, STDOUT
la a1, endline_str
li a2, 1
li a7, SYSWRITE
ecall
# if (output == ans[1])
beq s3, t2, print_true_1
# else
j print_false_1
print_true_1:
# print "True\n"
li a0, STDOUT
la a1, true_str
li a2, 5
li a7, SYSWRITE
ecall
j print_result_2
print_false_1:
# print "False\n"
li a0, STDOUT
la a1, false_str
li a2, 6
li a7, SYSWRITE
ecall
j print_result_2
print_result_2:
# print "output = "
li a0, STDOUT
la a1, output_str
li a2, 9
li a7, SYSWRITE
ecall
# print output
addi t0, s4, 0
la a1, buffer
call num_to_str
li a0, STDOUT
la a1, buffer
li a2, 12
li a7, SYSWRITE
ecall
# ", answer = "
li a0, STDOUT
la a1, answer_str
li a2, 11
li a7, SYSWRITE
ecall
# load ans[0] to t3
# print answer
addi t2, s1, 8
lw a0, 0(t2)
addi t0, a0, 0
la a1, buffer
call num_to_str
# reorder li and add
li a0, STDOUT
la a1, buffer
li a2, 12
li a7, SYSWRITE
addi t2, s1, 8
lw t2, 0(t2)
ecall
# print "\n"
li a0, STDOUT
la a1, endline_str
li a2, 1
li a7, SYSWRITE
ecall
# if (output == ans[2])
beq s4, t2, print_true_2
# else
j print_false_2
print_true_2:
# print "True\n"
li a0, STDOUT
la a1, true_str
li a2, 5
li a7, SYSWRITE
ecall
j main_exit
print_false_2:
# print "False\n"
li a0, STDOUT
la a1, false_str
li a2, 6
li a7, SYSWRITE
ecall
j main_exit
smallestNumber:
addi sp, sp, -12
sw ra, 8(sp)
sw t0, 4(sp)
sw t1, 0(sp)
# call clz
addi sp, sp, -8 # store ra, a0(n)
sw ra, 4(sp)
sw a0, 0(sp)
jal ra, clz
li t0, 32 # bit_len = 32
li t1, 1
sub t0, t0, a0 # bit_len = 32 - clz(n)
lw a0, 0(sp)
lw ra, 4(sp)
addi sp, sp, 8
sll t1, t1, t0
addi a0, t1, -1
lw t1, 0(sp)
lw t0, 4(sp)
lw ra, 8(sp)
addi sp, sp, 12
jalr x0, x1, 0
clz:
li t0, 32 # t0 = n
li t1, 16 # t1 = c
j clz_whileLoop
clz_whileLoop:
srl t2, a0, t1 # y = t2, y = x >> c
beq t2, x0, shift_right_1bit # if y == 0, jump to shift_right_1bit
sub t0, t0, t1 # n = n - c
addi a0, t2, 0 # x = y
j shift_right_1bit
shift_right_1bit:
srli t1, t1, 1 # c = c >> 1
bne t1, x0, clz_whileLoop # if c != 0, jump to clz_whileLoop
sub a0, t0, a0 # x = n - x
jalr x0, x1, 0
num_to_str:
# t1 is string index
addi t1, a1, 11
beqz t0, num_to_str_zero_case
j num_to_str_loop
num_to_str_loop:
li t2, 10
li t3, 0
j num_to_str_div10
num_to_str_div10:
# if t0 >= 10, t0 - 10
bge t0, t2, num_to_str_sub10
j num_to_str_div_done
num_to_str_sub10:
sub t0, t0, t2
addi t3, t3, 1
j num_to_str_div10
num_to_str_div_done:
# point to next string index
addi t0, t0, '0'
sb t0, 0(t1)
addi t0, t3, 0
addi t1, t1, -1
bnez t0, num_to_str_loop
j num_to_str_done
num_to_str_zero_case:
li t3, '0'
sb t3, 0(t1)
addi t1, t1, -1
j num_to_str_done
num_to_str_done:
# put "\0" on the string end
sb zero, 0(t1)
addi t1, t1, -1
ret
main_exit:
li a0, 0
li a7, SYSEXIT
ecall
```
:::

## Problem A in quiz2
### Assembly code
:::spoiler More detailed information
```=
.equ STDOUT, 1
.equ WRITE, 64
.equ EXIT, 93
.text
.globl hanoi
hanoi:
addi x2, x2, -36
sw x8, 0(x2)
sw x9, 4(x2)
sw x18, 8(x2)
sw x19, 12(x2)
sw x20, 16(x2)
li x5, 0x15
sw x5, 20(x2)
sw x5, 24(x2)
sw x5, 28(x2)
sw x1, 32(x2)
# Fix disk positions (BLANK 1-3: neutralize x5 effect)
# BLANK 1: Fix position at x2+20
sw x0, 20(x2)
# BLANK 2: Fix position at x2+24
sw x0, 24(x2)
# BLANK 3: Fix position at x2+28
sw x0, 28(x2)
addi x8, x0, 1
game_loop:
# BLANK 4: Check loop termination (2^3 moves)
addi x5, x0, 8
beq x8, x5, finish_game
# Gray code formula: gray(n) = n XOR (n >> k)
# BLANK 5: What is k for Gray code?
srli x5, x8, 1
# BLANK 6: Complete Gray(n) calculation
xor x6, x8, x5
# BLANK 7-8: Calculate previous value and its shift
addi x7, x8, -1
srli x28, x7, 1
# BLANK 9: Generate Gray(n-1)
xor x7, x7, x28
# BLANK 10: Which bits changed?
xor x5, x6, x7
# Initialize disk number
addi x9, x0, 0
# BLANK 11: Mask for testing LSB
andi x6, x5, 1
# BLANK 12: Branch if disk 0 moves
bne x6, x0, disk_found
# BLANK 13: Set disk 1
addi x9, x0, 1
# BLANK 14: Test second bit with proper mask
andi x6, x5, 2
bne x6, x0, disk_found
# BLANK 15: Last disk number
addi x9, x0, 2
disk_found:
# BLANK 16: Check impossible pattern (multiple bits)
andi x30, x5, 5
addi x31, x0, 5
beq x30, x31, pattern_match
jal x0, continue_move
pattern_match:
continue_move:
# BLANK 17: Word-align disk index (multiply by what?)
slli x5, x9, 2
# BLANK 18: Base offset for disk array
addi x5, x5, 20
add x5, x2, x5
lw x18, 0(x5)
bne x9, x0, handle_large
# BLANK 19: Small disk moves by how many positions?
addi x19, x18, 2
# BLANK 20: Number of pegs
addi x6, x0, 3
blt x19, x6, display_move
sub x19, x19, x6
jal x0, display_move
handle_large:
# BLANK 21: Load reference disk position
lw x6, 20(x2)
# BLANK 22: Sum of all peg indices (0+1+2)
addi x19, x0, 3
sub x19, x19, x18
sub x19, x19, x6
display_move:
la x20, obdata
add x5, x20, x18
# BLANK 23: Load byte from obfuscated data
lbu x11, 0(x5)
# BLANK 24: Decode constant (0x6F)
li x6, 0x6F
xor x11, x11, x6
# BLANK 25: Final offset adjustment
addi x11, x11, -0x12
add x7, x20, x19
lbu x12, 0(x7)
xor x12, x12, x6
addi x12, x12, -0x12
addi x30, x11, 0
addi x31, x12, 0
li x10, STDOUT
la x11, str1
li x12, 10
li x17, WRITE
ecall
li x10, STDOUT
addi x11, x9, 49
sb x11, 0(x11)
li x12, 1
li x17, WRITE
ecall
li x10, STDOUT
la x11, str2
li x12, 6
li x17, WRITE
ecall
li x10, STDOUT
addi x11, x30, 0
sb x11, 0(x11)
li x12, 1
li x17, WRITE
ecall
li x10, STDOUT
la x11, str3
li x12, 4
li x17, WRITE
ecall
li x10, STDOUT
addi x11, x31, 0
sb x11, 0(x11)
li x12, 1
li x17, WRITE
ecall
li x10, STDOUT
addi x11, x0, 10
sb x11, 0(x11)
li x12, 1
li x17, WRITE
ecall
# BLANK 26: Calculate storage offset
slli x5, x9, 2
addi x5, x5, 20
add x5, x2, x5
# BLANK 27: Update disk position
sw x19, 0(x5)
# BLANK 28-29: Increment counter and loop
addi x8, x8, 1
jal x0, game_loop
finish_game:
lw x8, 0(x2)
lw x9, 4(x2)
lw x18, 8(x2)
lw x19, 12(x2)
lw x20, 16(x2)
lw x1, 32(x2)
addi x2, x2, 36
ret
.data
obdata: .byte 0x3c, 0x3b, 0x3a
str1: .asciz "Move Disk "
str2: .asciz " from "
str3: .asciz " to "
```
:::

## Problem C in quiz3
Rsqrt : 1/sqrt(x)
- Lookup Table: Provides a fast initial estimate based on the input's magnitude.
- Linear Interpolation: Improves accuracy for non-exact powers of 2 by interpolating between table entries.
- Newton-Raphson Iteration: Refines the estimate using an efficient iterative method, yielding high precision.
### C code
:::spoiler More detailed information
```=
static const uint16_t rsqrt_table[32] = {
65535, 46341, 32768, 23170, 16384, /* 2^0 to 2^4 */
11585, 8192, 5793, 4096, 2896, /* 2^5 to 2^9 */
2048, 1448, 1024, 724, 512, /* 2^10 to 2^14 */
362, 256, 181, 128, 90, /* 2^15 to 2^19 */
64, 45, 32, 23, 16, /* 2^20 to 2^24 */
11, 8, 6, 4, 3, /* 2^25 to 2^29 */
2, 1 /* 2^30, 2^31 */
};
static void print_float(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf) - 1;
uint32_t cnt = 0;
*p = '\n';
p--;
if (val == 0) {
*p = '0';
p--;
} else {
while (val > 0)
{
*p = '0' + umod(val, 10);
p--;
val = udiv(val, 10);
cnt++;
}
}
while (cnt < 5)
{
*p = '0';
p--;
++cnt;
}
*p = '.';
p--;
*p = '0';
p--;
p++;
printstr(p, (buf + sizeof(buf) - p));
}
static uint64_t mul32(uint32_t a, uint32_t b)
{
uint64_t r = 0;
for (int i = 0 ; i < 32 ; ++i)
{
if (b & (1UL << i))
{
r += (uint64_t)a << i;
}
}
return r;
}
static uint32_t FloatToInt(uint32_t y)
{
uint32_t val = 50000, ans = 0;
for (int i = 15 ; i >= 0 ; --i)
{
if (((y >> i)&1) == 1)
{
ans = ans + val;
}
val >>= 1;
}
return ans;
}
static void Newton_Raphson(void)
{
uint32_t x, y, exp;
uint32_t y_base, y_next;
uint64_t y2, xy2;
uint32_t fraction;
for (uint32_t i = 0 ; i < 32 ; i+=2)
{
x = (1<<i)+i;
exp = 31 - clz(x);
/* x is between 2^exp and 2^(exp+1) */
y_base = rsqrt_table[exp]; /* Value at 2^exp */
y_next = rsqrt_table[exp + 1]; /* Value at 2^(exp+1) */
/* fraction: how far x is between 2^exp and 2^(exp+1) */
fraction = (x - (1<<exp)) >> exp;
/* Linear interpolation */
y = y_base - mul32((y_base - y_next), fraction);
TEST_LOGGER("X = ");
print_dec((unsigned long) x);
// TEST_LOGGER("Y = ");
// print_dec((unsigned long) y);
for (uint32_t j = 0 ; j < 2 ; ++j)
{
y2 = mul32(y, y);
// TEST_LOGGER("Y2 = ");
// print_dec((unsigned long) y2);
xy2 = mul32(x,y2) >> 16;
// TEST_LOGGER("XY2 = ");
// print_dec((unsigned long) xy2);
y = (mul32(y, ((3<<16) - xy2))) >> 17 ; /* Complete Newton step */
// TEST_LOGGER("Y = ");
// print_dec((unsigned long) y);
}
// TEST_LOGGER("Y = ");
// print_dec((unsigned long) y);
TEST_LOGGER("rsqrt = ");
print_float((unsigned long) FloatToInt(y));
TEST_LOGGER("\n");
}
}
```
:::
### Redo assembly code
I initially thought multiplication wouldn't exceed 32 bits, but after reviewing the documentation, I discovered that multiplication by multiplication would result in overflow, requiring two temporary registers, so the modification isn't finished yet.
:::spoiler More detailed information
```assembly=
.equ STDOUT, 1
.equ WRITE, 64
.equ FP_EXP_BIAS, 127
.equ FP_NAN, 0x7FC00000
.equ FP_ZERO, 0x00000000
.data
strX: .asciz "X = "
strY: .asciz ",Y = "
strEnd: .asciz "\n"
rsqrt_table:
.word 65535, 46341, 32768, 23170, 16384, 11585, 8192, 5793, 4096, 2896, 2048, 1448, 1024, 724, 512, 362, 256, 181, 128, 90, 64, 45, 32, 23, 16, 11, 8, 6, 4, 3, 2, 1
.lcomm buffer, 32
.text
.globl nrRsqrt
nrRsqrt:
addi sp, sp, -4
sw ra, 0(sp)
# set x = t3
add t3, a0, x0
j find_msb
find_msb:
# t4 = x
add t4, t3, x0
# t5 = msb index
addi t5, x0, -1
j find_msb_loop
find_msb_loop:
beq t4, x0, newton_raphson
# t4/2
srli t4, t4, 1
# index + 1
addi t5, t5, 1
j find_msb_loop
newton_raphson:
# y = rsqrt_table[msb_x]
la t1, rsqrt_table
slli t2, t5, 2
add t1, t1, t2
lw t4, 0(t1)
# call print_float(y)
# put char to buffer
la a1, buffer
addi sp, sp, -4
sw ra, 0(sp)
add a0, t4, x0
jal intToStr
lw ra, 0(sp)
addi sp, sp, 4
# print buffer
li a0, STDOUT
la a1, buffer
li a2, 32
li a7, WRITE
ecall
# printf("\n")
li a0, STDOUT
la a1, strEnd
li a2, 1
li a7, WRITE
ecall
# converge 4 times
addi t5, x0, 4
j newton_raphson_loop
newton_raphson_loop:
# assign a0 = y, a1 = y
add a0, x0, t4
add a1, x0, t4
# call mul(y, y)
addi sp, sp, -4
sw ra, 0(sp)
jal ra, mul
lw ra, 0(sp)
addi sp, sp, 4
# return y*y value to t2
add t2, a0, x0
# assign a0 = x, a1 = t2
add a0, x0, t3
add a1, x0, t2
# call mul(x, y^2)
addi sp, sp, -4
sw ra, 0(sp)
jal ra, mul
lw ra, 0(sp)
addi sp, sp, 4
# return x*y*y value to t2
add t2, a0, x0
# t2 = (x*y^2) >> 16
srli t2, t2, 16
# set t1 = 3
li t1, 3
# t1 = 3 << 16
slli t1, t1, 16
# (3<<16) - x*y^2
sub t2, t1, t2
# assign a0 = y, a1 = t2
add a0, x0, t4
add a1, x0, t2
# call mul(y, (3<<16) - x*y^2)
addi sp, sp, -4
sw ra, 0(sp)
jal ra, mul
lw ra, 0(sp)
addi sp, sp, 4
# return y*((3<<16) - x*y^2) value to t2
add t2, a0, x0
# y = y*((3<<16) - x*y^2) >> 17
srli t4, t2, 17
# count = count - 1
addi t5, t5, -1
beq t5, x0, nrRsqrt_done
j newton_raphson_loop
mul:
# set t0 = a0, t1 = a1
add t0, a0, x0
add t1, a1, x0
# set result = a0 = 0
add a0, x0, x0
j mul_loop
mul_loop:
beq t1, x0, mul_done
# compare first bit of y
andi t6, t1, 1
# first bit of y is zero jump to mul_shift
beq t6, x0, mul_shift
# first bit of y is one
# result = result + x
add a0, a0, t0
j mul_shift
mul_shift:
# x <<= 1
slli t0, t0, 1
# y >>= 1
srli t1, t1, 1
j mul_loop
mul_done:
ret
nrRsqrt_done:
# printf("X = ")
li a0, STDOUT
la a1, strX
li a2, 4
li a7, WRITE
ecall
# call intToStr(x)
# put char to buffer
la a1, buffer
addi sp, sp, -4
sw ra, 0(sp)
add a0, t3, x0
jal intToStr
lw ra, 0(sp)
addi sp, sp, 4
# print buffer
li a0, STDOUT
la a1, buffer
li a2, 32
li a7, WRITE
ecall
# printf(",Y = ")
li a0, STDOUT
la a1, strY
li a2, 5
li a7, WRITE
ecall
# call print_float(y)
# put char to buffer
la a1, buffer
addi sp, sp, -4
sw ra, 0(sp)
#srli t5, t4, 16
#add a0, t5, x0
add a0, t4, x0
jal intToStr
lw ra, 0(sp)
addi sp, sp, 4
# print buffer
li a0, STDOUT
la a1, buffer
li a2, 32
li a7, WRITE
ecall
# printf("\n")
li a0, STDOUT
la a1, strEnd
li a2, 1
li a7, WRITE
ecall
lw ra, 0(sp)
addi sp, sp, 4
ret
intToStr:
# assign t0 = x
add t0, a0, x0
# t6 is buffer index
addi t6, a1, 31
# if x = 0, then jump to zero case
beqz t0, intToStr_zero_case
# assign t1 = 10
li t1, 10
j intToStr_loop
intToStr_loop:
# t2(count 10) = 0
li t2, 0
j intToStr_div10
intToStr_div10:
# if x >= 10, then x-10
bge t0, t1, intToStr_sub10
j intToStr_div10_done
intToStr_sub10:
# x = x-10
sub t0, t0, t1
# count++
addi t2, t2, 1
j intToStr_div10
intToStr_div10_done:
# point to next string index
addi t0, t0, '0'
sb t0, 0(t6)
addi t0, t2, 0
addi t6, t6, -1
bnez t0, intToStr_loop
j intToStr_done
intToStr_zero_case:
# put 0 to buffer => string str[0] = '0'
li t0, '0'
sb t0, 0(t6)
addi t6, t6, -1
j intToStr_done
intToStr_done:
# put "\0" on the string end
sb zero, 0(t6)
addi t6, t6, -1
ret
```
:::



## Makefile
:::spoiler More detailed information
```=
include ../../../mk/toolchain.mk
ARCH = -march=rv32izicsr
LINKER_SCRIPT = linker.ld
EMU ?= ../../../build/rv32emu
AFLAGS = -g $(ARCH)
CFLAGS = -g -march=rv32i_zicsr
CFLAGS_O0 = -g -march=rv32i_zicsr -O0
CFLAGS_O1 = -g -march=rv32i_zicsr -O1
CFLAGS_O2 = -g -march=rv32i_zicsr -O2
CFLAGS_O3 = -g -march=rv32i_zicsr -O3
#CFLAGS_Os = -g -march=rv32i_zicsr -Os
CFLAGS_Ofast = -g -march=rv32i_zicsr -Ofast
LDFLAGS = -T $(LINKER_SCRIPT)
LDFLAGS_Os = -T $(LINKER_SCRIPT) -lgcc
EXEC = test.elf
EXECO0 = test_O0.elf
EXECO1 = test_O1.elf
EXECO2 = test_O2.elf
EXECO3 = test_O3.elf
EXECOs = test_Os.elf
EXECOfast = test_Ofast.elf
# test.elf
CC = $(CROSS_COMPILE)gcc
AS = $(CROSS_COMPILE)as
LD = $(CROSS_COMPILE)ld
OBJDUMP = $(CROSS_COMPILE)objdump
OBJS = start.o main.o perfcounter.o chacha20_asm.o LC3370.o hanoi.o
OBJS_O0 = start.o main_O0.o perfcounter.o chacha20_asm.o LC3370.o hanoi.o
OBJS_O1 = start.o main_O1.o perfcounter.o chacha20_asm.o LC3370.o hanoi.o
OBJS_O2 = start.o main_O2.o perfcounter.o chacha20_asm.o LC3370.o hanoi.o
OBJS_O3 = start.o main_O3.o perfcounter.o chacha20_asm.o LC3370.o hanoi.o
#OBJS_Os = start.o main_Os.o perfcounter.o chacha20_asm.o LC3370.o hanoi.o
OBJS_Ofast = start.o main_Ofast.o perfcounter.o chacha20_asm.o LC3370.o hanoi.o
# start.o main.o perfcounter.o chacha20_asm.o
EXECS = $(EXEC) $(EXECO0) $(EXECO1) $(EXECO2) $(EXECO3) $(EXECOfast)
.PHONY: all run dump clean
all: $(EXEC) $(EXECO0) $(EXECO1) $(EXECO2) $(EXECO3) $(EXECOfast)
$(EXEC): $(OBJS) $(LINKER_SCRIPT)
$(LD) $(LDFLAGS) -o $@ $(OBJS)
$(EXECO0): $(OBJS_O0) $(LINKER_SCRIPT)
$(LD) $(LDFLAGS) -o $@ $(OBJS_O0)
$(EXECO1): $(OBJS_O1) $(LINKER_SCRIPT)
$(LD) $(LDFLAGS) -o $@ $(OBJS_O1)
$(EXECO2): $(OBJS_O2) $(LINKER_SCRIPT)
$(LD) $(LDFLAGS) -o $@ $(OBJS_O2)
$(EXECO3): $(OBJS_O3) $(LINKER_SCRIPT)
$(LD) $(LDFLAGS) -o $@ $(OBJS_O3)
$(EXECOs): $(OBJS_Os) $(LINKER_SCRIPT)
$(LD) $(LDFLAGS_Os) -o $@ $(OBJS_Os)
$(EXECOfast): $(OBJS_Ofast) $(LINKER_SCRIPT)
$(LD) $(LDFLAGS) -o $@ $(OBJS_Ofast)
%.o: %.S
$(AS) $(AFLAGS) $< -o $@
%.o: %.c
$(CC) $(CFLAGS) $< -o $@ -c
main_O0.o: main_O0.c
$(CC) $(CFLAGS_O0) $< -o $@ -c
main_O1.o: main_O1.c
$(CC) $(CFLAGS_O1) $< -o $@ -c
main_O2.o: main_O2.c
$(CC) $(CFLAGS_O2) $< -o $@ -c
main_O3.o: main_O3.c
$(CC) $(CFLAGS_O3) $< -o $@ -c
main_Os.o: main_Os.c
$(CC) $(CFLAGS_Os) $< -o $@ -c
main_Ofast.o: main_Ofast.c
$(CC) $(CFLAGS_Ofast) $< -o $@ -c
run: $(EXEC) $(EXECO0) $(EXECO1) $(EXECO2) $(EXECO3) $(EXECOfast)
@test -f $(EMU) || (echo "Error: $(EMU) not found" && exit 1)
@grep -q "ENABLE_ELF_LOADER=1" ../../../build/.config || (echo "Error: ENABLE_ELF_LOADER=1 not set" && exit 1)
@grep -q "ENABLE_SYSTEM=1" ../../../build/.config || (echo "Error: ENABLE_SYSTEM=1 not set" && exit 1)
@for prog in $(EXECS); do \
echo ">>> $(EMU) $$prog"; \
$(EMU) $$prog || exit 1; \
echo ""; \
done
dump:
@read num; \
case $$num in \
1) $(OBJDUMP) -Ds $(EXEC) | less ;; \
2) $(OBJDUMP) -Ds $(EXECO0) | less ;; \
3) $(OBJDUMP) -Ds $(EXECO1) | less ;; \
4) $(OBJDUMP) -Ds $(EXECO2) | less ;; \
5) $(OBJDUMP) -Ds $(EXECO3) | less ;; \
6) $(OBJDUMP) -Ds $(EXECOs) | less ;; \
7) $(OBJDUMP) -Ds $(EXECOfast) | less ;; \
*) echo "無效選擇!" ;; \
esac
clean:
rm -f $(EXEC) $(EXECO0) $(EXECO1) $(EXECO2) $(EXECO3) $(EXECOfast) $(OBJS) $(OBJS_O0) $(OBJS_O1) $(OBJS_O2) $(OBJS_O3) $(OBJS_Os) $(OBJS_Ofast)
dump1 : $(EXEC)
$(OBJDUMP) -Ds $< | less
```
:::
## linker.ld
:::spoiler More detailed information
```=
OUTPUT_ARCH( "riscv" )
ENTRY(_start)
# _start)
SECTIONS
{
. = 0x10000;
.text : {
*(.text._start)
*(.text)
}
.data : { *(.data) }
.bss : {
__bss_start = .;
*(.bss)
__bss_end = .;
}
.stack (NOLOAD) : {
. = ALIGN(16);
. += 4096;
__stack_top = .;
}
}
```
:::
## Analysis Report
### Disassemble the ELF files
#### Leetcode 3370 in hw1
- cycle
| method | -O0 | -O1 | -O2 | -O3 | -Ofast |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
| c | 7904 | 8308 | 8308 | 8254 | 8254 |
| c unrolling | 20406 | 7904 | 8306 | 8146 | 8146 |
|assembly unrolling| 1905 | 1903 | 1903 | 1903 | 1903 |
- ELF file(c unrolling)
-- dump from O3.elf
```
13d5c: 00100513 li a0,1
13d60: 009005b3 add a1,zero,s1
13d64: 000d0613 mv a2,s10
13d68: 00000073 ecall
13d6c: 6e612b37 lui s6,0x6e612
13d70: 72657ab7 lui s5,0x72657
13d74: 00204a37 lui s4,0x204
13d78: 00100513 li a0,1
13d7c: 02cb0b13 addi s6,s6,44 # 6e61202c <__stack_top+0x6e5fc72c>
13d80: 773a8a93 addi s5,s5,1907 # 72657773 <__stack_top+0x72641e73>
13d84: d20a0a13 addi s4,s4,-736 # 203d20 <__stack_top+0x1ee420>
13d88: ab4fc0ef jal 1003c <print_dec>
13d8c: 00b00c93 li s9,11
13d90: 03612623 sw s6,44(sp)
13d94: 03512823 sw s5,48(sp)
13d98: 03412a23 sw s4,52(sp)
13d9c: 04000893 li a7,64
13da0: 00100513 li a0,1
13da4: 009005b3 add a1,zero,s1
13da8: 000c8613 mv a2,s9
13dac: 00000073 ecall
13db0: 65757937 lui s2,0x65757
13db4: 00100513 li a0,1
13db8: 25490913 addi s2,s2,596 # 65757254 <__stack_top+0x65741954>
13dbc: 00a00413 li s0,10
13dc0: a7cfc0ef jal 1003c <print_dec>
13dc4: 00500d93 li s11,5
13dc8: 03212623 sw s2,44(sp)
13dcc: 02811823 sh s0,48(sp)
13dd0: 04000893 li a7,64
13dd4: 00100513 li a0,1
13dd8: 009005b3 add a1,zero,s1
13ddc: 000d8613 mv a2,s11
13de0: 00000073 ecall
13de4: 03812623 sw s8,44(sp)
13de8: 03712823 sw s7,48(sp)
13dec: 03311a23 sh s3,52(sp)
13df0: 04000893 li a7,64
13df4: 00100513 li a0,1
13df8: 009005b3 add a1,zero,s1
13dfc: 000d0613 mv a2,s10
13e00: 00000073 ecall
```
---
### Command
How to check the elf size?
```
$ riscv-none-elf-size xxx.elf
```
How to check the elf header?
```
$ riscv-none-elf-readelf -h xxx.elf
```
How to use rv_histogram to print histogram of elf?
```
$ ../../../build/rv_histogram xxx.elf
```
---
### -O0
- Size
```
text data bss dec hex filename
27696 119 4112 31927 7cb7 test_O0.elf
```
- ELF header
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10000
Start of program headers: 52 (bytes into file)
Start of section headers: 51904 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 19
Section header string table index: 18
```
- Instruction Frequency histogram
```
+---------------------------------------------+
| RV32 Target Instruction Frequency Histogram |
+---------------------------------------------+
1. xor 19.96% [1334 ] ██████████████████████████████████████████████████
2. addi 12.79% [855 ] ████████████████████████████████
3. slli 11.32% [757 ] ████████████████████████████▎
4. add 11.32% [757 ] ████████████████████████████▎
5. srli 10.58% [707 ] ██████████████████████████▍
6. lw 9.09% [608 ] ██████████████████████▊
7. sw 7.43% [497 ] ██████████████████▋
8. jal 2.89% [193 ] ███████▏
9. lhu 2.02% [135 ] █████
10. lui 1.51% [101 ] ███▊
11. sh 1.36% [91 ] ███▍
12. ecall 1.18% [79 ] ██▉
```
---
### -O1
- Size
```
text data bss dec hex filename
18732 119 4120 22971 59bb test_O1.elf
```
- ELF header
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10000
Start of program headers: 52 (bytes into file)
Start of section headers: 39380 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 21
Section header string table index: 20
```
- Instruction Frequency histogram
```
+---------------------------------------------+
| RV32 Target Instruction Frequency Histogram |
+---------------------------------------------+
1. xor 28.85% [1331 ] ██████████████████████████████████████████████████
2. add 15.73% [726 ] ███████████████████████████▎
3. slli 14.43% [666 ] █████████████████████████
4. srli 14.04% [648 ] ████████████████████████▎
5. addi 8.89% [410 ] ███████████████▍
6. lw 5.03% [232 ] ████████▋
7. sw 3.53% [163 ] ██████
8. jal 2.36% [109 ] ████
9. ecall 1.06% [49 ] █▊
```
---
### -O2
- Size
```
text data bss dec hex filename
18884 119 4112 23115 5a4b test_O2.elf
```
- ELF header
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10000
Start of program headers: 52 (bytes into file)
Start of section headers: 40640 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 22
Section header string table index: 21
```
- Instruction Frequency histogram
```
+---------------------------------------------+
| RV32 Target Instruction Frequency Histogram |
+---------------------------------------------+
1. xor 30.18% [1331 ] ██████████████████████████████████████████████████
2. add 16.55% [730 ] ███████████████████████████▍
3. slli 15.06% [664 ] ████████████████████████▉
4. srli 14.72% [649 ] ████████████████████████▍
5. addi 8.66% [382 ] ██████████████▎
6. lw 3.92% [173 ] ██████▍
7. jal 2.18% [96 ] ███▌
8. sw 2.15% [95 ] ███▌
9. ecall 1.11% [49 ] █▊
```
---
### -O3
- Size
```
text data bss dec hex filename
19304 119 4112 23535 5bef test_O3.elf
```
- ELF header
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10000
Start of program headers: 52 (bytes into file)
Start of section headers: 43712 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 22
Section header string table index: 21
```
- Instruction Frequency histogram
```
+---------------------------------------------+
| RV32 Target Instruction Frequency Histogram |
+---------------------------------------------+
1. xor 29.27% [1331 ] ██████████████████████████████████████████████████
2. add 16.19% [736 ] ███████████████████████████▋
3. slli 14.71% [669 ] █████████████████████████▏
4. srli 14.43% [656 ] ████████████████████████▋
5. addi 9.21% [419 ] ███████████████▋
6. lw 3.96% [180 ] ██████▊
7. sw 2.24% [102 ] ███▊
8. jal 2.22% [101 ] ███▊
9. ecall 1.10% [50 ] █▉
```
---
### -Ofast
The -Ofast option is equivalent to -O3 -ffast-math, and can produce faster code if fast math optimizations are appropriate for your application.
- Size
```
text data bss dec hex filename
19304 119 4112 23535 5bef test_Ofast.elf
```
- ELF header
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10000
Start of program headers: 52 (bytes into file)
Start of section headers: 43720 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 22
Section header string table index: 21
```
- Instruction Frequency histogram
```
+---------------------------------------------+
| RV32 Target Instruction Frequency Histogram |
+---------------------------------------------+
1. xor 29.27% [1331 ] ██████████████████████████████████████████████████
2. add 16.19% [736 ] ███████████████████████████▋
3. slli 14.71% [669 ] █████████████████████████▏
4. srli 14.43% [656 ] ████████████████████████▋
5. addi 9.21% [419 ] ███████████████▋
6. lw 3.96% [180 ] ██████▊
7. sw 2.24% [102 ] ███▊
8. jal 2.22% [101 ] ███▊
9. ecall 1.10% [50 ] █▉
```
---
### Comparison
- This table shows the cycles and instructions for three programs (LC3370, Hanoi, and Newton) under different optimization levels (-O0, -O1, -O2, -O3, -Ofast).
#### cycles
| | -O0 | -O1 | -O2 | -O3 | -Ofast |
| -------- | -------- | -------- | -------- | -------- |-------- |
| LC3370 | 1905 | 1903 | 1903 | 1903 | 1903 |
| Hanoi | 647 | 645 | 645 | 645 | 645 |
| Newton | 229840 | 91847 | 100687 | 99208 | 99208 |
#### instructions
| | -O0 | -O1 | -O2 | -O3 | -Ofast |
| -------- | -------- | -------- | -------- | -------- |-------- |
| LC3370 | 1905 | 1903 | 1905 | 1905 | 1905 |
| Hanoi | 647 | 645 | 645 | 645 | 645 |
| Newton | 229840 | 91847 | 100687 | 99208 | 99208 |
1. LC3370
- Cycles: Regardless of the optimization level, the number of cycles for LC3370 stays constant at 1903.
- Instructions: Similarly, the number of instructions remains unchanged across all optimization levels, at 1905.
2. Hanoi
- Cycles: Like LC3370, the number of cycles for Hanoi stays the same across all optimization levels at 645.
- Instructions: The number of instructions also remains constant at 645, regardless of the optimization level.
3. Newton
- Cycles: The cycle count starts at 229,840 for -O0, and decreases as the optimization level increases. It drops to 91,847 at -O1, further reduces to 100,687 at -O2, and then stabilizes at 99,208 from -O3 onward.
- Instructions: Similarly, the instruction count starts at 229,840 at -O0, and decreases as the optimization level increases, reaching 99,208 at -O3 and remaining constant at -Ofast.
Summary:
- LC3370 and Hanoi show no significant performance changes in terms of cycles and instructions across different optimization levels. Both programs maintain stable performance.
- Newton benefits significantly from optimization. As the optimization level increases, both cycle and instruction counts decrease, showing noticeable improvements in performance, particularly from -O1 onward.
#### Problem in rv_histogram
* I modified main.c and added the memory keyword. This forces the compiler not to optimize or reorder the memory-related operations in the inline assembly code. However, when I use memory, I cannot run rv_histogram because the command results in a 'Bus error (core dumped)'. If I remove memory, rv_histogram works, but printing the string fails.
```
#define printstr(ptr, length) \
do { \
asm volatile( \
"add a7, x0, 0x40;" \
"add a0, x0, 0x1;" /* stdout */ \
"add a1, x0, %0;" \
"mv a2, %1;" /* length character */ \
"ecall;" \
: \
: "r"(ptr), "r"(length) \
: "a0", "a1", "a2", "a7", "memory");\
} while (0)
```
## Reference
* [Computer Architecture HW2](https://hackmd.io/@sysprog/2025-arch-homework2)
* [Assignment1: RISC-V Assembly and Instruction Pipeline](https://hackmd.io/9nh3WoYbTXyg9KSbWZjwxA?view)
* [RISC-V Instruction Set Manual](https://riscv.org/specifications/ratified/)
* [Quiz2 of Computer Architecture (2025 Fall)](https://hackmd.io/@sysprog/arch2025-quiz2)
* [Quiz3 of Computer Architecture (2025 Fall)](https://hackmd.io/@sysprog/arch2025-quiz3)
* [RISC-V Instruction Set Simulator and System Emulator](https://hackmd.io/@sysprog/Sko2Ja5pel)