[toc]
# Assignment2: RISC-V Complete Application
contributed by [< jningmin >](https://github.com/jningmin)
# Pick a question from hw1
The following question is picked from the Assignment 1 : **leetcode_70_climbing_stairs**
>leetcode_70_climbing_stairs
You are climbing a staircase. It takes n steps to reach the top.
Each time you can either climb 1 or 2 steps. In how many distinct ways can you climb to the top?
[C version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg#HW1_mainc)
Implements is referance to assignment 1 leetcode70 [version2 ](https://hackmd.io/_0UERjMoRcaRxV6aGN5fkw?stext=29986%3A2533%3A0%3A1762768139%3Acymaty&both=)
In a **bare-metal environment** using rv32emu’s system emulation you have to provide M extensions in I extension
The following are the algebraic mathematical functions we will need to use additionally.
```c
/* Software division for RV32I (no M extension) */
static unsigned long udiv(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long quotient = 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
quotient |= (1UL << i);
}
}
return quotient;
}
static unsigned long umod(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
}
}
return remainder;
}
/* Software multiplication for RV32I (no M extension) */
static uint32_t umul(uint32_t a, uint32_t b)
{
uint32_t result = 0;
while (b) {
if (b & 1)
result += a;
a <<= 1;
b >>= 1;
}
return result;
}
```
The following are the basic **vector implementation** methods.
```c
void vector_init(vector_t *v)
{
v->data = NULL;
v->size = 0;
v->count = 0;
v->free_slot = 0;
}
int32_t vector_push(vector_t *v, void *ptr)
{
if (!v->size) {
v->size = VECTOR_MIN_SIZE;
v->data = calloc(v->size, sizeof(void *));
}
if (v->free_slot && v->free_slot < v->count) {
size_t idx = v->free_slot;
v->data[idx] = ptr;
v->free_slot = 0;
return idx;
}
if (v->count == v->size) {
v->size *= 2;
v->data = realloc(v->data, v->size * sizeof(void *));
memset(v->data + v->count, 0, (v->size - v->count) * sizeof(void *));
}
v->data[v->count] = ptr;
return v->count++;
}
void *vector_pop(vector_t *v)
{
if (!v->count)
return NULL;
void *last = v->data[--v->count];
v->data[v->count] = NULL;
return last;
}
void vector_free(vector_t *v)
{
if (!v->data)
return;
free(v->data);
v->data = NULL;
v->size = 0;
v->count = 0;
v->free_slot = 0;
}
```
The followings are main steps combined functions.
```c
void generate_combinations(int n, vector_t *current) {
if (n == 0) {
TEST_LOGGER("[");
for (size_t i = 0; i < current->count; i++) {
print_dec((unsigned long)(intptr_t)current->data[i]);
if (i < current->count - 1)
TEST_LOGGER(",");
}
TEST_LOGGER("]\n");
return;
}
if (n >= 1) {
vector_push(current, (void *)(intptr_t)1);
generate_combinations(n - 1, current);
vector_pop(current);
}
if (n >= 2) {
vector_push(current, (void *)(intptr_t)2);
generate_combinations(n - 2, current);
vector_pop(current);
}
}
void stairs(int n) {
if (n <= 0) {
TEST_LOGGER("No stairs to climb.\n");
return;
}
TEST_LOGGER("Climbing ");
print_dec((unsigned long)n);
TEST_LOGGER(" stairs:\n");
vector_t path;
vector_init(&path);
generate_combinations(n, &path);
vector_free(&path);
}
```
The following is the print decimal function
```c
static void print_dec(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf) - 1;
*p = '\0';
p--;
if (val == 0) {
*p = '0';
p--;
} else {
while (val > 0) {
*p = '0' + umod(val, 10);
p--;
val = udiv(val, 10);
}
}
p++;
printstr(p, (buf + sizeof(buf) -1 - p));
}
```
The Hand Written Assembly
[ASM version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg?both#HW1_stairsS)
Implements is referance to assignment 1 leetcode70 [version2 ](https://hackmd.io/_0UERjMoRcaRxV6aGN5fkw#Assembly-code--version_2)
The main `difference` between RIPES and system simulation using RV32EMU is:
1.You have to set a stack_top at _start
```s
.globl _start
_start:
la sp, __stack_top
call main
li a7, EXIT
ecall
```
2.Completely different output method
```s
li a0,STDOUT
la a1, msg
li a2,18
li a7, WRITE
ecall
```
I chose this question because it fits the Riscvi pattern perfectly, making it suitable for learning a simulator that you're just starting out with.
**In a bare-metal environment, program behavior is similar to that of a regular compiler, but the presentation is different.**
In a bare-metal environment, there is no operating system or standard C library to handle dynamic memory, so we must implement` malloc`, `free`, `realloc`, and `calloc` ourselves.
```c
void *malloc(size_t size) {
if (size == 0) return NULL;
/* align to 8 bytes */
size = (size + 7) & ~((size_t)7);
if (__simple_heap_top + size > SIMPLE_HEAP_SIZE) return NULL;
void *p = &__simple_heap[__simple_heap_top];
__simple_heap_top += size;
return p;
}
void free(void *ptr) {
/* no-op for bump allocator */
(void)ptr;
}
void *realloc(void *ptr, size_t size) {
if (ptr == NULL) return malloc(size);
if (size == 0) { free(ptr); return NULL; }
void *newp = malloc(size);
if (!newp) return NULL;
memcpy(newp, ptr, size);
return newp;
}
void *calloc(size_t nmemb, size_t size) {
size_t total = nmemb * size;
void *p = malloc(total);
if (p) {
unsigned char *q = p;
for (size_t i = 0; i < total; ++i) q[i] = 0;
}
return p;
}
```
## hw1_c output after compiled
```s
#make run
../../../build/rv32enu test.elf
16:06:27 INFO src/riscv.c:552: Log level: TRACE
16:06:27 INFO src/riscv.c:565: test.elf ELF loaded
16:06:27 INFO src/main.c: 315: RISC-V emulator is created and ready to run
== Lab2_Homeworkl Tests ===
Test 1: n = 3
Climbing 3 stairs:
[1,1,1]
[1,2]
[2,1]
Cycles: 11689 Instructions: 11689
Test 2: n = 4
Climbing 4 stairs:
[1,1,1,1]
[1,1,2]
[1,2,1]
[2,1,1]
[2,2]
Cycles: 22371 Instructions: 22371
Test 3:n = 5
Climbing 5 stairs:
[1,1,1,2]
[1,1,2,1]
[1,2,1,1]
[1,2,2]
[2,1,1,1]
[2,1,2]
[2,2,1]
Cycles: 42258 Instructions: 42258
— ALL Tests Completed ==
16:06:27 INFO src/main.c:338: RISC-V emulator
```
**Compiling using RV32I bare-metal programming**
To compile the .c file, I use the following command.
```c
make
riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -O2 -c main.c -o main.o
riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld main.o -o test.elf
```
## hw1_S output after compiled
```s
#make тип
../../../build/rv32emu test.elf
21:08:35 INFO src/riscv.c:552: Log level: TRACE
21:08:35 INFO src/riscv.c:565: test.elf ELF loaded
21:08:35 INFO src/main.c:315: RISC-V emulator is created and ready to run
How many stairs?
n = 3
Step combinations:
{ 1 1 1 }
{ 1 2 ]
{ 1 2 }
Cycles: 631
п = 4
Step combinations:
{ 1 1 1 1 }
{ 1 1 2 }
{ 1 2 1 }
{ 2 1 1 }
{ 2 2 }
CycLes: 1250
n = 5
Step combinations:
{ 1 1 1 1 1 }
{ 1 1 1 2 }
{ 1 1 2 1 }
{ 1 2 1 1 }
{ 1 2 2 }
{ 2 1 1 1 }
{ 2 1 2 }
{ 2 2 1 }
CycLes: 2352
21:08:35 INFO src/main.c: 338: RISC-V emulator is destroyed
```
**Compiling using RV32I bare-metal programming**
To compile the .S file, I use the following command.
```c
make
riscv-none-elf-ld -T linker.ld -o test.elf perfcounter.o stairs.o
riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld perfcounter.o stairs.o -o test.elf
```
From the two outputs above, we can conclude that the cycle time using C version is significantly longer than RISC-V version.
# Quiz2_problem_A
This program approaches the Tower of Hanoi puzzle with an iterative strategy. It leverages Gray codes, where each code sequence acts as steps along a Hamiltonian path on an n-dimensional hypercube, with each bit representing a position or coordinate.
[C version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg?both#quiz2_mainc)
We got the same print decimal, algebraic mathematical functions as leet_code_70
The main functions of Hanoi algorithm and Gray code transfer
```c
unsigned char gray_code_trans(unsigned char n) {
return n ^ (n >> 1);
}
void hanoi_with_gray(int n, int* moves) {
unsigned char current, previous = 0;
for (int i = 1; i <= (1 << n) - 1; i++) {
current = gray_code_trans(i);
unsigned char diff = current ^ previous;
// find change bit
int disk = 0;
while ((diff & (1 << disk)) == 0) {
disk++;
}
int from_peg = moves[disk];
int to_peg;
if (disk == 0) {
// smallest disk
int small_shift = (umod((unsigned long)n, 2) == 0) ? 1 : 2;
to_peg = umod((unsigned long)(moves[0] + small_shift + 3), 3);
} else {
int tmp = 3 - moves[disk] - moves[0];
to_peg = umod((unsigned long)(tmp + 3), 3);
}
TEST_LOGGER("Move disk ");
char disk_letter = 'A' + disk;
TEST_OUTPUT(&disk_letter, 1);
TEST_LOGGER(" from peg ");
print_dec(from_peg + 1);
TEST_LOGGER(" to peg ");
print_dec(to_peg + 1);
TEST_LOGGER("\n");
moves[disk] = to_peg;
previous = current;
}
}
```
[ASM version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg?both#quiz2_hanoi_graycodeS)
`hanoi:`Initializes the loop limit (1<<n)-1 and iteratively generates each move.
Each iteration converts the step number to a Gray code, compares it with the previous value, and determines which disk changed.
```s
hanoi:
# bound = (1 << n) - 1
li t0, 1
sll s4, t0, s1 # s4 = 1 << n
addi s4, s4, -1 # s4 = (1<<n) - 1
li t1, 1 # i = 1
li s2, 0 # previous = 0
hanoi_loop:
bgt t1, s4, hanoi_end
# current = gray_code_trans(i)
mv a0, t1
jal ra, gray_code_trans
mv s3, a0 # s3 = current
xor t2, s3, s2 # diff = current ^ previous
li t3, 0 #disk
```
`find_disk:`Finds the disk corresponding to the flipped bit (difference between current and previous Gray code).
Then determines the source peg and computes the destination peg based on Hanoi movement rules (different logic for the smallest disk vs. larger ones).
```s
find_disk:
li t4, 1
sll t4, t4, t3 # mask = 1 << t3
and t5, t2, t4
bnez t5, disk_found
addi t3, t3, 1
j find_disk
disk_found:
# t3 = disk index (0=A,1=B,2=C)
# load current pos of that disk
la t6, moves
slli t4, t3, 2
add t6, t6, t4
lw t0, 0(t6) # t0 = from_peg (0..2)
mv s5, t0 # save from peg in s5
# compute to_peg ,t0 will be overwritten with target
# if disk == 0 , smallest disk
beq t3, x0, handle_small
# else (larger disk)
la t6, moves
lw s7, 0(t6) # t7 = moves[0] (ref)
li s8, 3
sub s8, s8, t0 # t8 = 3 - cur
sub s8, s8, s7 # t8 = 3 - cur - ref
# normalize t8 into 0..2
blt s8, x0, sub_add3_larger
li s9, 3
blt s8, s9, store_target_large
sub s8, s8, s9
j store_target_large
sub_add3_larger:
addi s8, s8, 3
store_target_large:
mv t0, s8
j do_print_and_store
handle_small:
# small_shift = (n % 2 == 0) ? 1 : 2
andi t6, s1, 1 # t6 = n & 1
beq t6, x0, small_shift_is_one
li t6, 2
j small_shift_done
small_shift_is_one:
li t6, 1
small_shift_done:
add s8, t0, t6 # t8 = moves[0] + small_shift
li s9, 3
blt s8, s9, store_target_small
sub s8, s8, s9
store_target_small:
mv t0, s8
```
`gray_code_trans:` Converts an integer i into Gray code using: gray = i ^ (i >> 1)
This ensures only one disk changes per step (key to iterative Hanoi).
```s
gray_code_trans:
mv t0, a0
srli s10, a0, 1
xor a0, t0, s10
mv t0,x0
ret
```
`print_int:` Prints an integer without using hardware division.
The number is repeatedly reduced by 10 to extract digits, converts each digit to ASCII, and outputs using a system call.
```s
print_int:
addi sp, sp, -20
sw ra, 0(sp)
sw t0, 4(sp)
sw t1, 8(sp)
sw t2, 12(sp)
sw t3, 16(sp)
la t0, buffer
addi t1, t0, 10
sb x0, 0(t1) # '\0'
beqz a0, print_zero_I
print_int_loop_I:
# --- 計算 a0 / 10 與餘數 ---
li t3, 0 # q = t3
li t4, 0 # rem = t4
li t2, 10
div_loop_I:
blt a0, t2, div_end_I # a0 < 10
addi a0, a0, -10
addi t3, t3, 1 # q + 1
j div_loop_I
div_end_I:
mv t4, a0 # rem
mv a0, t3 # q = a0
addi t4, t4, 48 # ASCII
addi t1, t1, -1
sb t4, 0(t1)
bnez a0, print_int_loop_I
# --- OUTPUT ---
li a0, 1
mv a1, t1
la t2, buffer
addi t2, t2, 10
sub a2, t2, t1
li a7, 64
ecall
j print_int_end_I
print_zero_I:
li t2, 48
sb t2, 0(t0)
li a0, 1
la a1, buffer
li a2, 1
li a7, 64
ecall
print_int_end_I:
lw ra, 0(sp)
lw t0, 4(sp)
lw t1, 8(sp)
lw t2, 12(sp)
lw t3, 16(sp)
addi sp, sp, 20
ret
```
## quiz2_hanoi_gray_bare.c output after compiled
```s
#make run
../../. ./build/rv32emu test.elf
12:41:24 INFO STC/riscv.c:552: Log level: TRACE
12:41:24 INFO src/riscv.c:565: test.elf ELF Loaded
12:41:24 INFO src/main.c:315: RISC-V emulator is created and ready to run
Moving 3 disks using Gray code sequence:
Move disk A from peg 1 to peg 3
Move disk B from peg 1 to peg 2
Move disk A from peg 3 to peg 2
Move disk C from peg 1 to peg 3
Move disk A from peg 2 to peg 1
Move disk B from peg 2 to peg 3
Move disk A from peg 1 to peg 3
Cycles: 25199 Instructions: 25199
12:41:24 INFO src/main.c:338: RISC-V emulator is destroyed
```
**Compiling using RV32I bare-metal programming**
To compile the .c file, I use the following command.
```c
make
riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -O2 -c main.c -o main.o
riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld main.o -o main.elf
```
## quiz2_hanoi_gray_bare.S output after compiled
```s
#make run
../../../build/rv32emu test.elf
14:57:17 INFO src/riscv.c: 552: Log Level: TRACE
14:57:17 INFO src/riscv.c: 565: test.elf ELF Loaded
14:57:17 INFO src/main.c:315: RISC-V emulator is created and ready to run
Moving 3 disks using Gray code sequence:
Move disk A from peg 1 to peg 3
Move disk B from peg 1 to peg 2
Move disk A from peg 3 to peg 2
Move disk C from peg 1 to peg 3
Move disk A from peg 2 to peg 1
Move disk B from peg 2 to peg 3
Move disk A from peg 1 to peg 3
Cycles: 704
14:57:17 INFO src/main.c:338: RISC-V emulator is destroyed
```
**Compiling using RV32I bare-metal programming**
To compile the .S file, I use the following command.
```c
make
riscv-none-elf-ld -T linker.ld -o test.elf perfcounter.o hanoi_gray_bare.o
riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld perfcounter.o hanoi_gray_bare.o -o test.elf
```
From the two outputs above, we can conclude that the cycle time using C version is significantly longer than RISC-V version.
# Quiz3_problem_C
`rsqrt` implements a fast reciprocal square root on RV32I, optimized to run without a hardware multiplier. It uses a LUT-based initial guess followed by Newton iterations to refine accuracy. All multiplications are performed using shift-add logic. The result achieves roughly 3–8% error, balancing speed and precision.

Final formula:
$$
y_{n+1} = y_n \left(\frac{3 - xy_n^2}{2}\right) = y_n \left(\frac{3}{2} - \frac{xy_n^2}{2}\right)
$$
[C version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg?both#quiz3_mainc)
`fast_rsqrt():`Computes an approximate reciprocal square root (1 / sqrt(x)) in Q16 fixed-point, optimized for RV32I.
>Steps:
Get exponent of x using clz()
Use lookup table (rsqrt_table) to get initial guess
Optional linear interpolation makes the guess more accurate
Performs two iterations of Newton's method
Newton update formula (fixed-point version):
y = y * (1.5 – 0.5 * x * y²)
Very fast and avoids float hardware.
```C
uint32_t fast_rsqrt(uint32_t x)
{
if (x == 0) return 0xFFFFFFFF; /* saturate */
if (x == 1) return (1U << SCALE); /* 1.0 in Q16 */
/* exponent = floor(log2(x)) */
int exp = 31 - clz(x);
uint32_t y = rsqrt_table[exp]; /* Q16 initial */
/* linear interpolation for better initial guess */
if (x > (1u << exp)) {
uint32_t y1 = (exp < 31) ? rsqrt_table[exp + 1] : 0;
uint32_t diff = (y > y1) ? (y - y1) : 0;
uint32_t frac = (uint32_t)((((uint64_t)x - (1ULL << exp)) << SCALE) >> exp); /* Q16 fraction */
y -= (uint32_t)( (mul32(diff, frac)) >> SCALE ); /* keep Q16 */
}
for (int i = 0; i < 2; i++) {
uint64_t y2 = mul32(y, y); /* Q32 */
uint32_t y2_hi = (uint32_t)(y2 >> 16); /* Q16 */
uint64_t xy2 = mul32(x, y2_hi); /* A_Q16 */
uint64_t three_q16 = (uint64_t)3U << SCALE; /* 3 in Q16 */
/* compute B_Q16 = ((3<<16) - xy2) >> 1 (still fits <= 32 bits for valid inputs) */
uint64_t tmp;
if (xy2 >= three_q16) tmp = 0; else tmp = (three_q16 - xy2) >> 1;
uint32_t B_q16 = (uint32_t)tmp;
y = (uint32_t)( mul32(y, B_q16) >> SCALE ); /* Q16 */
}
return y;
}
```
`uint32_t fast_distance_3d():`Computes distance = sqrt(dx² + dy² + dz²) but avoids real square root.
>Steps:
Take absolute values of dx, dy, dz
Compute squared distance: dx² + dy² + dz²
Scale down if overflow risk
Compute 1/sqrt(dist_sq) via fast_rsqrt()
Multiply result back to recover sqrt()
Faster than integer sqrt, especially on RV32I.
```C
uint32_t fast_distance_3d(int32_t dx, int32_t dy, int32_t dz)
{
/* absolute values */
dx = (dx < 0) ? -dx : dx;
dy = (dy < 0) ? -dy : dy;
dz = (dz < 0) ? -dz : dz;
/* sum of squares (use 64-bit) */
uint64_t dist_sq = mul32((uint32_t)dx, (uint32_t)dx) +
mul32((uint32_t)dy, (uint32_t)dy) +
mul32((uint32_t)dz, (uint32_t)dz);
uint32_t scaled_dist_sq;
int shift = 0;
if (dist_sq > 0xFFFFFFFFULL) {
scaled_dist_sq = (uint32_t)(dist_sq >> 16); /* scale down */
shift = 8; /* will multiply result by 2^8 later */
} else {
scaled_dist_sq = (uint32_t)dist_sq;
}
uint32_t inv_sqrt = fast_rsqrt(scaled_dist_sq); /* Q16 */
uint64_t result = mul32(scaled_dist_sq, inv_sqrt) >> SCALE; /* back to Q0 (integer) */
if (shift) result <<= shift;
return (result > UINT32_MAX) ? UINT32_MAX : (uint32_t)result;
}
```
`mul32()/udiv64()`: These functions exist because RV32I does not have a hardware multiplier (no M-extension).
```C
static uint64_t mul32(uint32_t a, uint32_t b)
{
uint64_t result = 0;
for (int i = 0; i < 32; i++) {
if (b & (1U << i)) {
result += (uint64_t)a << i;
}
}
return result;
}
static uint64_t udiv64(uint64_t dividend, uint32_t divisor)
{
if (divisor == 0) return (uint64_t)-1;
uint64_t quotient = 0;
uint64_t rem = 0;
for (int i = 63; i >= 0; i--) {
rem = (rem << 1) | ((dividend >> i) & 1ULL);
if (rem >= divisor) {
rem -= divisor;
quotient |= (1ULL << i);
}
}
return quotient;
}
```
`clz()`:Counts the number of leading zeros in a 32-bit integer.
Used to compute log2(x) efficiently, which determines the magnitude of x during rsqrt.
```C
static int clz(uint32_t x){
if (!x) return 32;
int n = 0;
if ((x & 0xFFFF0000) == 0) { n += 16; x <<= 16; }
if ((x & 0xFF000000) == 0) { n += 8; x <<= 8; }
if ((x & 0xF0000000) == 0) { n += 4; x <<= 4; }
if ((x & 0xC0000000) == 0) { n += 2; x <<= 2; }
if ((x & 0x80000000) == 0) { n += 1; }
return n;
}
```
`my_abs() / f_abs()`:Computes absolute values using bit manipulation instead of branching.
my_abs() handles signed integers;
f_abs() returns the absolute difference between two unsigned integers.
```C
static int32_t my_abs(int32_t x) {
int32_t mask = x >> 31;
return (x + mask) ^ mask;
}
static uint32_t f_abs(uint32_t x, uint32_t y) {
return (x >= y) ? (x - y) : (y - x);
}
```
`rsqrt_table[32]`:A precomputed Q16 fixed-point lookup table storing an approximate initial value of 1/sqrt(x) based on the exponent of x.
This drastically reduces the number of Newton iterations required.
```C
static const uint16_t rsqrt_table[32] = {
65535U, 46341U, 32768U, 23170U, 16384U,
11585U, 8192U, 5793U, 4096U, 2896U,
2048U, 1448U, 1024U, 724U, 512U, 362U,
256U, 181U, 128U, 90U, 64U, 45U,
32U, 23U, 16U, 11U, 8U, 6U, 4U, 3U, 2U, 1U
};
```
`my_sqrt():`Bit-wise integer square root implementation.
Useful for comparison or fallback, but slower than fast_rsqrt().
```C
static uint32_t my_sqrt(uint32_t x) {
if (x == 0) return 0;
uint32_t result = 0;
uint32_t bit = 1U << 30;
while (bit > x) bit >>= 2;
while (bit != 0) {
if (x >= result + bit) {
x -= result + bit;
result = (result >> 1) + bit;
} else {
result >>= 1;
}
bit >>= 2;
}
return result;
}
```
[ASM version of Source code]()
## quiz3_rsqrt_bare.c output after compiled
```S
#make run
../../../build/rv32emu test.elf
15:51:17 INFO src/riscv.c:552: Log Level: TRACE
15:51:17 INFO src/riscv.c:565: test.elf ELF Loaded
15:51:17 INFO src/main.c:315: RISC-V emulator is created and ready to run
=== Part 1: Reciprocal Square Root Comparison ===
x = 4 fast = 0.5000 expected = 0.5000 error = 0.000%
Cycles: 2865
Instructions: 2865
x = 2000 fast = 0.0223 expected = 0.0223 error = 0.000%
Cycles: 4313
Instructions: 4313
x = 50000 fast = 0.0058 expected = 0.0044 error = 30.716%
Cycles: 4050
Instructions: 4050
= Part 2: 3D Distance Approxination (scaled correctly) ==
x = (3,4,12) fast = 13 expected = 13 error = 0%
Cycles: 6252
Instructions: 6252
x = (10,20,30) fast = 37 expected = 37 error = 0%
Cycles: 5906
Instructions: 5906
x = (100,200,300) fast = 245 expected = 374 error = 34%
CycLes: 5343
Instructions: 5343
15:51:17 INFO src/main.c: 338: RISC-V emulator is destroyed
```
**Compiling using RV32I bare-metal programming**
```c
make
riscv-none-elf-ld -T linker.ld start.o perfcounter.o rsqrt_bare.o -o test.elf
riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld start.o perfcounter.o rsqrt_bare.o -o test.elf
```
From the above output, we can conclude that a larger test case size does not necessarily mean a longer cycle time, but a larger test case size may result in a higher percentage of errors.
# Full code
## HW1_main.c
```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#define SIMPLE_HEAP_SIZE (16 * 1024)
static unsigned char __simple_heap[SIMPLE_HEAP_SIZE];
static size_t __simple_heap_top = 0;
void *malloc(size_t size) {
if (size == 0) return NULL;
/* align to 8 bytes */
size = (size + 7) & ~((size_t)7);
if (__simple_heap_top + size > SIMPLE_HEAP_SIZE) return NULL;
void *p = &__simple_heap[__simple_heap_top];
__simple_heap_top += size;
return p;
}
void free(void *ptr) {
/* no-op for bump allocator */
(void)ptr;
}
void *realloc(void *ptr, size_t size) {
if (ptr == NULL) return malloc(size);
if (size == 0) { free(ptr); return NULL; }
void *newp = malloc(size);
if (!newp) return NULL;
memcpy(newp, ptr, size);
return newp;
}
void *calloc(size_t nmemb, size_t size) {
size_t total = nmemb * size;
void *p = malloc(total);
if (p) {
unsigned char *q = p;
for (size_t i = 0; i < total; ++i) q[i] = 0;
}
return p;
}
void *memset(void *s, int c, size_t n) {
unsigned char *p = s;
unsigned char uc = (unsigned char)c;
for (size_t i = 0; i < n; ++i) p[i] = uc;
return s;
}
#define VECTOR_MIN_SIZE 16
typedef struct {
void **data;
size_t size; /* Allocated size */
size_t count; /* Number of elements */
size_t free_slot; /* Index of a known hole */
} vector_t;
#define printstr(ptr, length) \
do { \
asm volatile( \
"add a7, x0, 0x40;" \
"add a0, x0, 0x1;" /* stdout */ \
"add a1, x0, %0;" \
"mv a2, %1;" /* length character */ \
"ecall;" \
: \
: "r"(ptr), "r"(length) \
: "a0", "a1", "a2", "a7"); \
} while (0)
#define TEST_OUTPUT(msg, length) printstr(msg, length)
#define TEST_LOGGER(msg) \
{ \
char _msg[] = msg; \
TEST_OUTPUT(_msg, sizeof(_msg) - 1); \
}
extern uint64_t get_cycles(void);
extern uint64_t get_instret(void);
/* Bare metal memcpy implementation */
void *memcpy(void *dest, const void *src, size_t n)
{
uint8_t *d = (uint8_t *) dest;
const uint8_t *s = (const uint8_t *) src;
while (n--)
*d++ = *s++;
return dest;
}
/* Software division for RV32I (no M extension) */
static unsigned long udiv(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long quotient = 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
quotient |= (1UL << i);
}
}
return quotient;
}
static unsigned long umod(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
}
}
return remainder;
}
/* Software multiplication for RV32I (no M extension) */
static uint32_t umul(uint32_t a, uint32_t b)
{
uint32_t result = 0;
while (b) {
if (b & 1)
result += a;
a <<= 1;
b >>= 1;
}
return result;
}
/* Provide __mulsi3 for GCC */
uint32_t __mulsi3(uint32_t a, uint32_t b)
{
return umul(a, b);
}
/* Simple integer to decimal string conversion */
static void print_dec(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf);
*--p = '\0';
if (val == 0) {
*--p = '0';
} else {
while (val > 0) {
*--p = '0' + umod(val, 10);
val = udiv(val, 10);
}
}
printstr(p, (buf + sizeof(buf) -1 - p));
}
/* --------Vector*/
void vector_init(vector_t *v)
{
v->data = NULL;
v->size = 0;
v->count = 0;
v->free_slot = 0;
}
int32_t vector_push(vector_t *v, void *ptr)
{
if (!v->size) {
v->size = VECTOR_MIN_SIZE;
v->data = calloc(v->size, sizeof(void *));
}
if (v->free_slot && v->free_slot < v->count) {
size_t idx = v->free_slot;
v->data[idx] = ptr;
v->free_slot = 0;
return idx;
}
if (v->count == v->size) {
v->size *= 2;
v->data = realloc(v->data, v->size * sizeof(void *));
memset(v->data + v->count, 0, (v->size - v->count) * sizeof(void *));
}
v->data[v->count] = ptr;
return v->count++;
}
void *vector_pop(vector_t *v)
{
if (!v->count)
return NULL;
void *last = v->data[--v->count];
v->data[v->count] = NULL;
return last;
}
void vector_free(vector_t *v)
{
if (!v->data)
return;
free(v->data);
v->data = NULL;
v->size = 0;
v->count = 0;
v->free_slot = 0;
}
void generate_combinations(int n, vector_t *current) {
if (n == 0) {
TEST_LOGGER("[");
for (size_t i = 0; i < current->count; i++) {
print_dec((unsigned long)(intptr_t)current->data[i]);
if (i < current->count - 1)
TEST_LOGGER(",");
}
TEST_LOGGER("]\n");
return;
}
if (n >= 1) {
vector_push(current, (void *)(intptr_t)1);
generate_combinations(n - 1, current);
vector_pop(current);
}
if (n >= 2) {
vector_push(current, (void *)(intptr_t)2);
generate_combinations(n - 2, current);
vector_pop(current);
}
}
void stairs(int n) {
if (n <= 0) {
TEST_LOGGER("No stairs to climb.\n");
return;
}
TEST_LOGGER("Climbing ");
print_dec((unsigned long)n);
TEST_LOGGER(" stairs:\n");
vector_t path;
vector_init(&path);
generate_combinations(n, &path);
vector_free(&path);
}
int main(void)
{
uint64_t start_cycles, end_cycles, cycles_elapsed;
uint64_t start_instret, end_instret, instret_elapsed;
TEST_LOGGER("\n=== Lab2_Homework1 Tests ===\n");
/* Test 1:*/
TEST_LOGGER("Test 1: n = 3\n");
start_cycles = get_cycles();
start_instret = get_instret();
stairs(3);
end_cycles = get_cycles();
end_instret = get_instret();
cycles_elapsed = end_cycles - start_cycles;
instret_elapsed = end_instret - start_instret;
TEST_LOGGER(" Cycles: ");
print_dec((unsigned long) cycles_elapsed);
TEST_LOGGER(" Instructions: ");
print_dec((unsigned long) instret_elapsed);
TEST_LOGGER("\n");
/* Test 2: */
TEST_LOGGER("Test 2: n = 4\n");
start_cycles = get_cycles();
start_instret = get_instret();
stairs(4);
end_cycles = get_cycles();
end_instret = get_instret();
cycles_elapsed = end_cycles - start_cycles;
instret_elapsed = end_instret - start_instret;
TEST_LOGGER(" Cycles: ");
print_dec((unsigned long) cycles_elapsed);
TEST_LOGGER(" Instructions: ");
print_dec((unsigned long) instret_elapsed);
TEST_LOGGER("\n");
/* Test 3: */
TEST_LOGGER("Test 3:n = 5\n");
start_cycles = get_cycles();
start_instret = get_instret();
stairs(5);
end_cycles = get_cycles();
end_instret = get_instret();
cycles_elapsed = end_cycles - start_cycles;
instret_elapsed = end_instret - start_instret;
TEST_LOGGER(" Cycles: ");
print_dec((unsigned long) cycles_elapsed);
TEST_LOGGER(" Instructions: ");
print_dec((unsigned long) instret_elapsed);
TEST_LOGGER("\n");
TEST_LOGGER("\n=== All Tests Completed ===\n");
return 0;
}
```
## HW1_stairs.S
```s
.data
.align 2
msg: .string "How many stairs?\n"
.align 2
msg1: .string "Step combinations:\n"
.align 2
msg2: .string "{ "
.align 2
msg3: .string "}\n"
.align 2
msg4: .string " "
.align 2
msg5: .string "\n"
.align 2
msg_cycles: .string " Cycles: "
.align 2
testcase_a: .string "n = 3"
.align 2
testcase_b: .string "n = 4"
.align 2
testcase_c: .string "n = 5"
.align 2
steps: .space 256
.align 2
buffer: .space 64
.equ STDOUT, 1
.equ WRITE, 64
.equ EXIT, 93
.text
.align 2
.globl _start
_start:
la sp, __stack_top
call main
li a7, EXIT
ecall
main:
# Print "How many stairs?"
li a0,STDOUT
la a1, msg
li a2,18
li a7, WRITE
ecall
##------------------A---------------------------
li a0, STDOUT
la a1, testcase_a
li a2, 5
li a7, WRITE
ecall
li a0, STDOUT
la a1, msg5
li a2, 1
li a7, WRITE
ecall
li a0, STDOUT
la a1, msg1
li a2, 19
li a7, WRITE
ecall
li t4, 3 # n = 3
mv s2, x0
mv s3, x0
##------------cycle-------------------------------
call get_cycles
mv s4, a0 # low
mv s5, a1 # high
jal ra, printSteps
call get_cycles
mv t0, a0 # end low
mv t1, a1 # end high
# 計算差值
sub s1, t0, s4
sub s6, t1, s5
# 輸出 cycles
li a0, STDOUT
la a1, msg_cycles
li a2, 10
li a7, WRITE
ecall
mv a0, s1 # low bits
jal ra, print_int
li a0, STDOUT
la a1, msg5
li a2, 1
li a7, WRITE
ecall
##------------------B---------------------------
li a0, STDOUT
la a1, msg5
li a2, 1
li a7, WRITE
ecall
li a0, STDOUT
la a1, testcase_b
li a2, 5
li a7, WRITE
ecall
li a0, STDOUT
la a1, msg5
li a2, 1
li a7, WRITE
ecall
li a0, STDOUT
la a1, msg1
li a2, 19
li a7, WRITE
ecall
li t4, 4 # n = 4
mv s2, x0
mv s3, x0
##------------cycle-------------------------------
call get_cycles
mv s4, a0 # low
mv s5, a1 # high
jal ra, printSteps
call get_cycles
mv t0, a0 # end low
mv t1, a1 # end high
# 計算差值
sub s1, t0, s4
sub s6, t1, s5
# 輸出 cycles
li a0, STDOUT
la a1, msg_cycles
li a2, 10
li a7, WRITE
ecall
mv a0, s1 # low bits
jal ra, print_int
li a0, STDOUT
la a1, msg5
li a2, 1
li a7, WRITE
ecall
##------------------C---------------------------
li a0, STDOUT
la a1, msg5
li a2, 1
li a7, WRITE
ecall
li a0, STDOUT
la a1, testcase_c
li a2, 5
li a7, WRITE
ecall
li a0, STDOUT
la a1, msg5
li a2, 1
li a7, WRITE
ecall
li a0, STDOUT
la a1, msg1
li a2, 19
li a7, WRITE
ecall
li t4, 5 # n = 5
mv s2, x0
mv s3, x0
##------------cycle-------------------------------
call get_cycles
mv s4, a0 # low
mv s5, a1 # high
jal ra, printSteps
call get_cycles
mv t0, a0 # end low
mv t1, a1 # end high
# 計算差值
sub s1, t0, s4
sub s6, t1, s5
# 輸出 cycles
li a0, STDOUT
la a1, msg_cycles
li a2, 10
li a7, WRITE
ecall
mv a0, s1 # low bits
jal ra, print_int
li a0, STDOUT
la a1, msg5
li a2, 1
li a7, WRITE
ecall
end:
li a7, EXIT
ecall
printSteps:
bnez t4, L1
printSteps_0:
addi s2, s2, 1
# Print "{ "
li a0, STDOUT
la a1, msg2
li a2, 2
li a7, WRITE
ecall
# loop print steps
addi sp, sp, -4
sw ra, 0(sp)
li t0, 0
loop_0:
la a1,steps
slli t2, t0, 2
add t1, a1, t2
lw t3, 0(t1)
mv a0, t3
jal ra, print_int
li a0, STDOUT
la a1, msg4
li a2, 1
li a7, WRITE
ecall
addi t0, t0, 1
bltu t0, s3, loop_0
# Print "}\n"
li a0, STDOUT
la a1, msg3
li a2, 2
li a7, WRITE
ecall
lw ra, 0(sp)
addi sp, sp, 4
ret
L1:
li t0, 1
bltu t4, t0, R
addi sp, sp, -16
sw ra, 0(sp)
sw t4, 4(sp)
sw s3, 8(sp)
sw a1, 12(sp)
jal ra, printSteps_1
lw ra, 0(sp)
lw t4, 4(sp)
lw s3, 8(sp)
lw a1, 12(sp)
addi sp, sp, 16
L2:
li t0, 2
bltu t4, t0, R
addi sp, sp, -16
sw ra, 0(sp)
sw t4, 4(sp)
sw s3, 8(sp)
sw a1, 12(sp)
jal ra, printSteps_2
lw ra, 0(sp)
lw t4, 4(sp)
lw s3, 8(sp)
lw a1, 12(sp)
addi sp, sp, 16
R:
ret
#-----------------------------------------------
printSteps_1:
la a1,steps
slli t2, s3, 2
add t1, a1, t2
li t6, 1
sw t6, 0(t1)
addi t4, t4, -1
addi s3, s3, 1
addi sp, sp, -12
sw ra, 0(sp)
sw t4, 4(sp)
sw s3, 8(sp)
jal ra, printSteps
lw ra, 0(sp)
lw t4, 4(sp)
lw s3, 8(sp)
addi sp, sp, 12
ret
#-----------------------------------------------
printSteps_2:
la a1,steps
slli t2, s3, 2
add t1, a1, t2
li t6, 2
sw t6, 0(t1)
addi t4, t4, -2
addi s3, s3, 1
addi sp, sp, -12
sw ra, 0(sp)
sw t4, 4(sp)
sw s3, 8(sp)
jal ra, printSteps
lw ra, 0(sp)
lw t4, 4(sp)
lw s3, 8(sp)
addi sp, sp, 12
ret
#-----------------------------------------------
print_int:
addi sp, sp, -20
sw ra, 0(sp)
sw t0, 4(sp)
sw t1, 8(sp)
sw t2, 12(sp)
sw t3, 16(sp)
la t0, buffer # buffer 開頭
addi t1, t0, 10 # 結尾位置
sb x0, 0(t1) # 加上 '\0'
beqz a0, print_zero_I # 若為 0 特殊處理
print_int_loop_I:
# --- 計算 a0 / 10 與餘數 ---
li t3, 0 # 商 = 0
li t4, 0 # 餘數 = 0
li t2, 10
div_loop_I:
blt a0, t2, div_end_I # 若 a0 < 10,結束
addi a0, a0, -10 # 減 10
addi t3, t3, 1 # 商 + 1
j div_loop_I
div_end_I:
mv t4, a0 # 剩下的就是餘數
mv a0, t3 # 商成為新的 a0
addi t4, t4, 48 # 轉 ASCII
addi t1, t1, -1
sb t4, 0(t1)
bnez a0, print_int_loop_I
# --- 輸出字串 ---
li a0, 1 # STDOUT
mv a1, t1
la t2, buffer
addi t2, t2, 10
sub a2, t2, t1 # 長度
li a7, 64 # WRITE
ecall
j print_int_end_I
print_zero_I:
li t2, 48
sb t2, 0(t0)
li a0, 1
la a1, buffer
li a2, 1
li a7, 64
ecall
print_int_end_I:
lw ra, 0(sp)
lw t0, 4(sp)
lw t1, 8(sp)
lw t2, 12(sp)
lw t3, 16(sp)
addi sp, sp, 20
ret
```
## quiz2_main.c
```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#define printstr(ptr, length) \
do { \
asm volatile( \
"add a7, x0, 0x40;" \
"add a0, x0, 0x1;" /* stdout */ \
"add a1, x0, %0;" \
"mv a2, %1;" /* length character */ \
"ecall;" \
: \
: "r"(ptr), "r"(length) \
: "a0", "a1", "a2", "a7"); \
} while (0)
#define TEST_OUTPUT(msg, length) printstr(msg, length)
#define TEST_LOGGER(msg) \
{ \
char _msg[] = msg; \
TEST_OUTPUT(_msg, sizeof(_msg) - 1); \
}
extern uint64_t get_cycles(void);
extern uint64_t get_instret(void);
/* Bare metal memcpy implementation */
void *memcpy(void *dest, const void *src, size_t n)
{
uint8_t *d = (uint8_t *) dest;
const uint8_t *s = (const uint8_t *) src;
while (n--)
*d++ = *s++;
return dest;
}
/* Software division for RV32I (no M extension) */
static unsigned long udiv(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long quotient = 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
quotient |= (1UL << i);
}
}
return quotient;
}
static unsigned long umod(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
}
}
return remainder;
}
/* Simple integer to decimal string conversion */
static void print_dec(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf) - 1;
*p = '\0';
p--;
if (val == 0) {
*p = '0';
p--;
} else {
while (val > 0) {
*p = '0' + umod(val, 10);
p--;
val = udiv(val, 10);
}
}
p++;
printstr(p, (buf + sizeof(buf) -1 - p));
}
unsigned char gray_code_trans(unsigned char n) {
return n ^ (n >> 1);
}
void hanoi_with_gray(int n, int* moves) {
unsigned char current, previous = 0;
for (int i = 1; i <= (1 << n) - 1; i++) {
current = gray_code_trans(i);
unsigned char diff = current ^ previous;
// 找出改變的位元位置
int disk = 0;
while ((diff & (1 << disk)) == 0) {
disk++;
}
int from_peg = moves[disk];
int to_peg;
if (disk == 0) {
// smallest disk
int small_shift = (umod((unsigned long)n, 2) == 0) ? 1 : 2;
to_peg = umod((unsigned long)(moves[0] + small_shift + 3), 3);
} else {
int tmp = 3 - moves[disk] - moves[0];
to_peg = umod((unsigned long)(tmp + 3), 3);
}
TEST_LOGGER("Move disk ");
char disk_letter = 'A' + disk;
TEST_OUTPUT(&disk_letter, 1);
TEST_LOGGER(" from peg ");
print_dec(from_peg + 1);
TEST_LOGGER(" to peg ");
print_dec(to_peg + 1);
TEST_LOGGER("\n");
moves[disk] = to_peg;
previous = current;
}
}
int main() {
uint64_t start_cycles, end_cycles, cycles_elapsed;
uint64_t start_instret, end_instret, instret_elapsed;
int n = 3;
//max_steps =2^n
int moves[8] = {0};
TEST_LOGGER("Moving ");
print_dec(n);
TEST_LOGGER(" disks using Gray code sequence:\n");
start_cycles = get_cycles();
start_instret = get_instret();
hanoi_with_gray(n, moves);
end_cycles = get_cycles();
end_instret = get_instret();
cycles_elapsed = end_cycles - start_cycles;
instret_elapsed = end_instret - start_instret;
TEST_LOGGER(" Cycles: ");
print_dec((unsigned long) cycles_elapsed);
TEST_LOGGER(" Instructions: ");
print_dec((unsigned long) instret_elapsed);
TEST_LOGGER("\n");
return 0;
}
```
## quiz2_hanoi_graycode.S
```s
.data
.align 2
msg_hdr: .string "Moving 3 disks using Gray code sequence:\n"
.align 2
moving_msg: .string "Move disk "
.align 2
msgA: .string "A"
.align 2
msgB: .string "B"
.align 2
msgC: .string "C"
.align 2
msg_from: .string " from peg "
.align 2
msg_to: .string " to peg "
.align 2
newline: .string "\n"
.align 2
msg_cycles: .string " Cycles: "
.align 2
buffer: .word 32 # temporary 1-byte buffer at buffer
.align 2
moves: .word 0,0,0,0,0,0,0,0 # 8 words, initialized to 0
.equ STDOUT, 1
.equ WRITE, 64
.equ EXIT, 93
.text
.align 2
.globl _start
_start:
la sp, __stack_top
call main
li a7, EXIT
li a0, 0
ecall
main:
# print header
li a0, STDOUT
la a1, msg_hdr
li a2, 41
li a7, WRITE
ecall
# n = 3
li s1, 3
##------------cycle-------------------------------
call get_cycles
mv s6, a0 # low
mv s11, a1 # high
hanoi:
# bound = (1 << n) - 1
li t0, 1
sll s4, t0, s1 # s4 = 1 << n
addi s4, s4, -1 # s4 = (1<<n) - 1
li t1, 1 # i = 1
li s2, 0 # previous = 0
hanoi_loop:
bgt t1, s4, hanoi_end
# current = gray_code_trans(i)
mv a0, t1
jal ra, gray_code_trans
mv s3, a0 # s3 = current
xor t2, s3, s2 # diff = current ^ previous
li t3, 0 #disk
find_disk:
li t4, 1
sll t4, t4, t3 # mask = 1 << t3
and t5, t2, t4
bnez t5, disk_found
addi t3, t3, 1
j find_disk
disk_found:
# t3 = disk index (0=A,1=B,2=C)
# load current pos of that disk
la t6, moves
slli t4, t3, 2
add t6, t6, t4
lw t0, 0(t6) # t0 = from_peg (0..2)
mv s5, t0 # save from peg in s5
# compute to_peg (t0 will be overwritten with target)
# if disk == 0 -> smallest disk rule
beq t3, x0, handle_small
# else (larger disk): to_peg = 3 - cur - moves[0]; then normalize to 0..2
la t6, moves
lw s7, 0(t6) # t7 = moves[0] (ref)
li s8, 3
sub s8, s8, t0 # t8 = 3 - cur
sub s8, s8, s7 # t8 = 3 - cur - ref
# normalize t8 into 0..2
blt s8, x0, sub_add3_larger
li s9, 3
blt s8, s9, store_target_large
sub s8, s8, s9
j store_target_large
sub_add3_larger:
addi s8, s8, 3
store_target_large:
mv t0, s8
j do_print_and_store
handle_small:
# small_shift = (n % 2 == 0) ? 1 : 2
andi t6, s1, 1 # t6 = n & 1
beq t6, x0, small_shift_is_one
li t6, 2
j small_shift_done
small_shift_is_one:
li t6, 1
small_shift_done:
add s8, t0, t6 # t8 = moves[0] + small_shift
li s9, 3
blt s8, s9, store_target_small
sub s8, s8, s9
store_target_small:
mv t0, s8
do_print_and_store:
# Print "Move disk "
li a0, STDOUT
la a1, moving_msg
li a2, 10
li a7, WRITE
ecall
# Print disk letter (A/B/C)
# choose correct label
li t6, 0
beq t3, t6, print_A
li t6, 1
beq t3, t6, print_B
# default -> C
li a0, STDOUT
la a1, msgC
li a2, 1
li a7, WRITE
ecall
j print_from_label
print_A:
li a0, STDOUT
la a1, msgA
li a2, 1
li a7, WRITE
ecall
j print_from_label
print_B:
li a0, STDOUT
la a1, msgB
li a2, 1
li a7, WRITE
ecall
print_from_label:
# Print " from peg "
li a0, STDOUT
la a1, msg_from
li a2, 10
li a7, WRITE
ecall
# print from peg number (1-based) as single char
li t6, 49 #'1'
add t6, t6, s5 # '1' + from (s5)
la s7, buffer
sb t6, 0(s7)
li a0, STDOUT
la a1, buffer
li a2, 1
li a7, WRITE
ecall
# Print " to peg "
li a0, STDOUT
la a1, msg_to
li a2, 8
li a7, WRITE
ecall
# print to peg number (1-based) as single char - t0 holds target
li t6, 49 #'1'
add t6, t6, t0
la s7, buffer
sb t6, 0(s7)
li a0, STDOUT
la a1, buffer
li a2, 1
li a7, WRITE
ecall
# newline
li a0, STDOUT
la a1, newline
li a2, 1
li a7, WRITE
ecall
# store moves[disk] = target (t0)
la t6, moves
slli t4, t3, 2
add t6, t6, t4
sw t0, 0(t6)
# prepare next step
mv s2, s3 # previous = current
addi t1, t1, 1
j hanoi_loop
hanoi_end:
call get_cycles
mv t0, a0 # end low
mv t1, a1 # end high
# 計算差值
sub s1, t0, s6
sub s6, t1, s11
# 輸出 cycles
li a0, STDOUT
la a1, msg_cycles
li a2, 10
li a7, WRITE
ecall
mv a0, s1 # low bits
jal ra, print_int
li a0, STDOUT
la a1, newline
li a2, 1
li a7, WRITE
ecall
li a7, EXIT
li a0, 0
ecall
gray_code_trans:
mv t0, a0
srli s10, a0, 1
xor a0, t0, s10
mv t0,x0
ret
print_int:
addi sp, sp, -20
sw ra, 0(sp)
sw t0, 4(sp)
sw t1, 8(sp)
sw t2, 12(sp)
sw t3, 16(sp)
la t0, buffer
addi t1, t0, 10
sb x0, 0(t1) # '\0'
beqz a0, print_zero_I
print_int_loop_I:
# --- 計算 a0 / 10 與餘數 ---
li t3, 0 # q = t3
li t4, 0 # rem = t4
li t2, 10
div_loop_I:
blt a0, t2, div_end_I
addi a0, a0, -10
addi t3, t3, 1 # q + 1
j div_loop_I
div_end_I:
mv t4, a0
mv a0, t3 # q = a0
addi t4, t4, 48 # ASCII
addi t1, t1, -1
sb t4, 0(t1)
bnez a0, print_int_loop_I
# --- 輸出字串 ---
li a0, 1 # STDOUT
mv a1, t1
la t2, buffer
addi t2, t2, 10
sub a2, t2, t1
li a7, 64 # WRITE
ecall
j print_int_end_I
print_zero_I:
li t2, 48
sb t2, 0(t0)
li a0, 1
la a1, buffer
li a2, 1
li a7, 64
ecall
print_int_end_I:
lw ra, 0(sp)
lw t0, 4(sp)
lw t1, 8(sp)
lw t2, 12(sp)
lw t3, 16(sp)
addi sp, sp, 20
ret
```
## quiz3_main.c
```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#define SCALE (16)
#define printstr(ptr, length) \
do { \
asm volatile( \
"add a7, x0, 0x40;" \
"add a0, x0, 0x1;" /* stdout */ \
"add a1, x0, %0;" \
"mv a2, %1;" /* length character */ \
"ecall;" \
: \
: "r"(ptr), "r"(length) \
: "a0", "a1", "a2", "a7"); \
} while (0)
#define TEST_OUTPUT(msg, length) printstr(msg, length)
#define TEST_LOGGER(msg) \
{ \
char _msg[] = msg; \
TEST_OUTPUT(_msg, sizeof(_msg) - 1); \
}
extern uint64_t get_cycles(void);
extern uint64_t get_instret(void);
static unsigned long umod(unsigned long dividend, unsigned long divisor);
static unsigned long udiv(unsigned long dividend, unsigned long divisor);
static uint64_t mul32(uint32_t a, uint32_t b)
{
uint64_t result = 0;
for (int i = 0; i < 32; i++) {
if (b & (1U << i)) {
result += (uint64_t)a << i;
}
}
return result;
}
static const uint16_t rsqrt_table[32] = {
65535U, 46341U, 32768U, 23170U, 16384U,
11585U, 8192U, 5793U, 4096U, 2896U,
2048U, 1448U, 1024U, 724U, 512U, 362U,
256U, 181U, 128U, 90U, 64U, 45U,
32U, 23U, 16U, 11U, 8U, 6U, 4U, 3U, 2U, 1U
};
static int clz(uint32_t x){
if (!x) return 32;
int n = 0;
if ((x & 0xFFFF0000) == 0) { n += 16; x <<= 16; }
if ((x & 0xFF000000) == 0) { n += 8; x <<= 8; }
if ((x & 0xF0000000) == 0) { n += 4; x <<= 4; }
if ((x & 0xC0000000) == 0) { n += 2; x <<= 2; }
if ((x & 0x80000000) == 0) { n += 1; }
return n;
}
uint32_t fast_rsqrt(uint32_t x)
{
if (x == 0) return 0xFFFFFFFF; /* saturate */
if (x == 1) return (1U << SCALE); /* 1.0 in Q16 */
/* exponent = floor(log2(x)) */
int exp = 31 - clz(x);
uint32_t y = rsqrt_table[exp]; /* Q16 initial */
/* linear interpolation for better initial guess */
if (x > (1u << exp)) {
uint32_t y1 = (exp < 31) ? rsqrt_table[exp + 1] : 0;
uint32_t diff = (y > y1) ? (y - y1) : 0;
uint32_t frac = (uint32_t)((((uint64_t)x - (1ULL << exp)) << SCALE) >> exp); /* Q16 fraction */
y -= (uint32_t)( (mul32(diff, frac)) >> SCALE ); /* keep Q16 */
}
//* Newton iterations (2 times)
for (int i = 0; i < 2; i++) {
uint64_t y2 = mul32(y, y); /* Q32 */
uint32_t y2_hi = (uint32_t)(y2 >> 16); /* Q16 */
uint64_t xy2 = mul32(x, y2_hi); /* A_Q16 */
uint64_t three_q16 = (uint64_t)3U << SCALE; /* 3 in Q16 */
/* compute B_Q16 = ((3<<16) - xy2) >> 1 (still fits <= 32 bits for valid inputs) */
uint64_t tmp;
if (xy2 >= three_q16) tmp = 0; else tmp = (three_q16 - xy2) >> 1;
uint32_t B_q16 = (uint32_t)tmp;
y = (uint32_t)( mul32(y, B_q16) >> SCALE ); /* Q16 */
}
return y;
}
uint32_t fast_distance_3d(int32_t dx, int32_t dy, int32_t dz)
{
/* absolute values */
dx = (dx < 0) ? -dx : dx;
dy = (dy < 0) ? -dy : dy;
dz = (dz < 0) ? -dz : dz;
/* sum of squares (use 64-bit) */
uint64_t dist_sq = mul32((uint32_t)dx, (uint32_t)dx) +
mul32((uint32_t)dy, (uint32_t)dy) +
mul32((uint32_t)dz, (uint32_t)dz);
uint32_t scaled_dist_sq;
int shift = 0;
if (dist_sq > 0xFFFFFFFFULL) {
scaled_dist_sq = (uint32_t)(dist_sq >> 16); /* scale down */
shift = 8; /* will multiply result by 2^8 later */
} else {
scaled_dist_sq = (uint32_t)dist_sq;
}
uint32_t inv_sqrt = fast_rsqrt(scaled_dist_sq); /* Q16 */
uint64_t result = mul32(scaled_dist_sq, inv_sqrt) >> SCALE; /* back to Q0 (integer) */
if (shift) result <<= shift;
return (result > UINT32_MAX) ? UINT32_MAX : (uint32_t)result;
}
/* integer sqrt (bitwise) */
static uint32_t my_sqrt(uint32_t x) {
if (x == 0) return 0;
uint32_t result = 0;
uint32_t bit = 1U << 30;
while (bit > x) bit >>= 2;
while (bit != 0) {
if (x >= result + bit) {
x -= result + bit;
result = (result >> 1) + bit;
} else {
result >>= 1;
}
bit >>= 2;
}
return result;
}
static int32_t my_abs(int32_t x) {
int32_t mask = x >> 31;
return (x + mask) ^ mask;
}
static uint32_t f_abs(uint32_t x, uint32_t y) {
return (x >= y) ? (x - y) : (y - x);
}
/* Simple integer to decimal string conversion */
static void print_dec_no_nl(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf) - 1;
*p = '\0';
p--;
if (val == 0) {
*p = '0';
p--;
} else {
while (val > 0) {
*p = '0' + (char)umod(val, 10);
p--;
val = udiv(val, 10);
}
}
p++;
printstr(p, (size_t)(buf + sizeof(buf) - p - 1)); /* exclude terminating '\0' */
}
/* print decimal with fixed width (pad with '0') */
static void print_dec_pad(unsigned long val, int width)
{
char buf[32];
for (int i = width - 1; i >= 0; --i) {
buf[i] = '0' + (char)umod(val, 10);
val = udiv(val, 10);
}
printstr(buf, (size_t)width);
}
/* keep original print_dec as newline-terminated convenience */
static void print_dec(unsigned long val)
{
print_dec_no_nl(val);
TEST_LOGGER("\n");
}
void *memcpy(void *dest, const void *src, size_t n)
{
uint8_t *d = (uint8_t *) dest;
const uint8_t *s = (const uint8_t *) src;
while (n--)
*d++ = *s++;
return dest;
}
static uint32_t umul(uint32_t a, uint32_t b)
{
uint32_t result = 0;
while (b) {
if (b & 1)
result += a;
a <<= 1;
b >>= 1;
}
return result;
}
static unsigned long umod(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
}
}
return remainder;
}
/* Software division for RV32I (no M extension) */
static unsigned long udiv(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long quotient = 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
quotient |= (1UL << i);
}
}
return quotient;
}
/* 64-bit divide by 32-bit (shift-subtract) -> returns 64-bit quotient */
static uint64_t udiv64(uint64_t dividend, uint32_t divisor)
{
if (divisor == 0) return (uint64_t)-1;
uint64_t quotient = 0;
uint64_t rem = 0;
for (int i = 63; i >= 0; i--) {
rem = (rem << 1) | ((dividend >> i) & 1ULL);
if (rem >= divisor) {
rem -= divisor;
quotient |= (1ULL << i);
}
}
return quotient;
}
int main(void)
{
uint64_t start_cycles, end_cycles, cycles_elapsed;
uint64_t start_instret, end_instret, instret_elapsed;
TEST_LOGGER("=== Part 1: Reciprocal Square Root Comparison ===\n\n");
int testcase[3] = {4, 2000, 50000};
for (int i = 0; i < 3; i++) {
start_cycles = get_cycles();
start_instret = get_instret();
uint32_t x = (uint32_t)testcase[i];
uint32_t y_fixed = fast_rsqrt(x); /* Q16 */
end_cycles = get_cycles();
end_instret = get_instret();
cycles_elapsed = end_cycles - start_cycles;
instret_elapsed = end_instret - start_instret;
/* format fast (Q16) into int.frac(4 digits) using umul */
uint32_t y_int = y_fixed >> SCALE;
uint32_t mask = (1u << SCALE) - 1;
uint64_t yprod = mul32((uint32_t)(y_fixed & mask), 10000); /* 64-bit product */
uint32_t y_frac4 = (uint32_t)(yprod >> SCALE);
// compute expected in Q16:
uint32_t sqrt_x_q8 = my_sqrt(x << SCALE);
uint32_t expected_fixed = (sqrt_x_q8 == 0) ? 0 : (uint32_t)udiv((1U << (SCALE + 8)), sqrt_x_q8);
uint64_t eprod = mul32(expected_fixed & mask, 10000);
uint32_t exp_int = expected_fixed >> SCALE;
uint32_t exp_frac4 = (uint32_t)(eprod >> SCALE);
uint32_t diff_fixed = f_abs(expected_fixed, y_fixed);
/* error_milli = (diff_fixed * 100000) / expected_fixed, use mul32 + udiv64 */
uint64_t prod = mul32(diff_fixed, 100000U);
uint64_t error_milli = (expected_fixed == 0) ? 0 : udiv64(prod, expected_fixed);
uint32_t error_int = (uint32_t)(error_milli / 1000ULL);
uint32_t error_frac3 = (uint32_t)(error_milli % 1000ULL);
TEST_LOGGER("x = ");
print_dec_no_nl(x);
TEST_LOGGER(" fast = ");
print_dec_no_nl(y_int);
TEST_LOGGER(".");
print_dec_pad(y_frac4, 4);
TEST_LOGGER(" expected = ");
print_dec_no_nl(exp_int);
TEST_LOGGER(".");
print_dec_pad(exp_frac4, 4);
TEST_LOGGER(" error = ");
print_dec_no_nl(error_int);
TEST_LOGGER(".");
print_dec_pad(error_frac3, 3);
TEST_LOGGER("%\n");
TEST_LOGGER(" Cycles: ");
print_dec((unsigned long)cycles_elapsed);
TEST_LOGGER(" Instructions: ");
print_dec((unsigned long)instret_elapsed);
TEST_LOGGER("\n");
}
TEST_LOGGER("\n=== Part 2: 3D Distance Approximation (scaled correctly) ===\n\n");
int dxs[3] = {3, 10, 100};
int dys[3] = {4, 20, 200};
int dzs[3] = {12, 30, 300};
for (int i = 0; i < 3; i++) {
start_cycles = get_cycles();
start_instret = get_instret();
uint32_t fast = fast_distance_3d(dxs[i], dys[i], dzs[i]);
end_cycles = get_cycles();
end_instret = get_instret();
cycles_elapsed = end_cycles - start_cycles;
instret_elapsed = end_instret - start_instret;
/* sum_squares using umul */
uint32_t s0 = umul((uint32_t)dxs[i], (uint32_t)dxs[i]);
uint32_t s1 = umul((uint32_t)dys[i], (uint32_t)dys[i]);
uint32_t s2 = umul((uint32_t)dzs[i], (uint32_t)dzs[i]);
uint32_t sum_squares = s0 + s1 + s2;
uint32_t real = my_sqrt(sum_squares);
uint32_t diff = (fast > real) ? (fast - real) : (real - fast);
/* error = (diff * 100) / real -> use umul + udiv */
uint32_t pct_num = umul(diff, 100U);
uint32_t error = (real == 0) ? 0 : (uint32_t)udiv(pct_num, real);
TEST_LOGGER("x = ");
TEST_LOGGER("(");
print_dec_no_nl((unsigned long)dxs[i]);
TEST_LOGGER(",");
print_dec_no_nl((unsigned long)dys[i]);
TEST_LOGGER(",");
print_dec_no_nl((unsigned long)dzs[i]);
TEST_LOGGER(") fast = ");
print_dec_no_nl(fast);
TEST_LOGGER(" expected = ");
print_dec_no_nl(real);
TEST_LOGGER(" error = ");
print_dec_no_nl(error);
TEST_LOGGER("%\n");
TEST_LOGGER(" Cycles: ");
print_dec((unsigned long)cycles_elapsed);
TEST_LOGGER(" Instructions: ");
print_dec((unsigned long)instret_elapsed);
TEST_LOGGER("\n");
}
return 0;
}
```