---
title: 'Assignment 2: Complete Applications'
disqus: hackmd
---
Assignment 2: Complete Applications
===
>[!Note] AI tools usage
>I use ChatGPT to assist with Quiz 2 by providing code explanations, grammar revisions, pre-work research, code summaries, and explanations of standard RISC-V instruction usage.
> Github code for this Assigment Homework: [Link](https://github.com/scarlett910/ca2025-hw2)
## Pick one program in Homework 1
#### 1. Program Overview
##### **1.1. UF8 Encoding/Decoding**
The uf8 datatype is an 8-bit floating-point-like format with 4-bit mantissa and 4-bit exponent.
🔹```uf8_decode(uf8 fl)```
Converts an 8-bit uf8 value to a 32-bit integer.
🔹```uf8_encode(uint32_t value)```
Converts a 32-bit integer to the uf8 format.
* Uses count leading zeros (CLZ) to estimate the exponent.
* Performs iterative adjustment to ensure correct round-trip encoding.
The C code to run bare-metal:
```c=
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include "rsqrt.h"
#define printstr(ptr, length) \
do { \
asm volatile( \
"add a7, x0, 0x40;" \
"add a0, x0, 0x1;" /* stdout */ \
"add a1, x0, %0;" \
"mv a2, %1;" /* length character */ \
"ecall;" \
: \
: "r"(ptr), "r"(length) \
: "a0", "a1", "a2", "a7"); \
} while (0)
#define TEST_OUTPUT(msg, length) printstr(msg, length)
#define TEST_LOGGER(msg) \
{ \
char _msg[] = msg; \
TEST_OUTPUT(_msg, sizeof(_msg) - 1); \
}
/* Bare metal memcpy implementation */
void *memcpy(void *dest, const void *src, size_t n)
{
uint8_t *d = (uint8_t *) dest;
const uint8_t *s = (const uint8_t *) src;
while (n--)
*d++ = *s++;
return dest;
}
/* Software division for RV32I (no M extension) */
unsigned long udiv(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long quotient = 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
quotient |= (1UL << i);
}
}
return quotient;
}
static unsigned long umod(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
}
}
return remainder;
}
/* Software multiplication for RV32I (no M extension) */
uint32_t umul(uint32_t a, uint32_t b)
{
uint32_t result = 0;
while (b) {
if (b & 1)
result += a;
a <<= 1;
b >>= 1;
}
return result;
}
/* Provide __mulsi3 for GCC */
uint32_t __mulsi3(uint32_t a, uint32_t b)
{
return umul(a, b);
}
/* Provide __udivsi3 for GCC - used by rsqrt.c */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor)
{
return (uint32_t)udiv(dividend, divisor);
}
/* Simple integer to hex string conversion */
void print_hex(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf) - 1;
*p = '\n';
p--;
if (val == 0) {
*p = '0';
p--;
} else {
while (val > 0) {
int digit = val & 0xf;
*p = (digit < 10) ? ('0' + digit) : ('a' + digit - 10);
p--;
val >>= 4;
}
}
p++;
printstr(p, (buf + sizeof(buf) - p));
}
/* Simple integer to decimal string conversion */
void print_dec(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf) - 1;
*p = '\n';
p--;
if (val == 0) {
*p = '0';
p--;
} else {
while (val > 0) {
*p = '0' + umod(val, 10);
p--;
val = udiv(val, 10);
}
}
p++;
printstr(p, (buf + sizeof(buf) - p));
}
/* ================================================
* HOMEWORK 1: UF8 IMPLEMENTATION
* ================================================ */
typedef uint8_t uf8;
static inline unsigned clz(uint32_t x) {
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) {
n -= c;
x = y;
}
c >>= 1;
} while (c);
return n - x;
}
uint32_t uf8_decode(uf8 fl) {
uint32_t mantissa = fl & 0x0f;
uint8_t exponent = fl >> 4;
uint32_t offset = (0x7FFF >> (15 - exponent)) << 4;
return (mantissa << exponent) + offset;
}
uf8 uf8_encode(uint32_t value) {
if (value < 16)
return value;
int lz = clz(value);
int msb = 31 - lz;
uint8_t exponent = 0;
uint32_t overflow = 0;
if (msb >= 5) {
exponent = msb - 4;
if (exponent > 15)
exponent = 15;
for (uint8_t e = 0; e < exponent; e++)
overflow = (overflow << 1) + 16;
while (exponent > 0 && value < overflow) {
overflow = (overflow - 16) >> 1;
exponent--;
}
}
while (exponent < 15) {
uint32_t next_overflow = (overflow << 1) + 16;
if (value < next_overflow)
break;
overflow = next_overflow;
exponent++;
}
uint8_t mantissa = (value - overflow) >> exponent;
return (exponent << 4) | mantissa;
}
static bool test_uf8_roundtrip(void) {
int32_t previous_value = -1;
bool passed = true;
for (int i = 0; i < 256; i++) {
uint8_t fl = i;
int32_t value = uf8_decode(fl);
uint8_t fl2 = uf8_encode(value);
if (fl != fl2) {
TEST_LOGGER("FAIL: "); print_hex(fl);
TEST_LOGGER(" produces value "); print_dec(value);
TEST_LOGGER(" but encodes back to "); print_hex(fl2);
passed = false;
}
if (value <= previous_value) {
TEST_LOGGER("FAIL: "); print_hex(fl);
TEST_LOGGER(" value "); print_dec(value);
TEST_LOGGER(" <= previous_value "); print_dec(previous_value);
passed = false;
}
previous_value = value;
}
return passed;
}
int main(void) {
TEST_LOGGER("=== Start Test Suite for Homework 1: UF8 ===\n");
bool all_passed_uf8 = test_uf8_roundtrip();
if (all_passed_uf8) {
TEST_LOGGER("== Homework 1: All tests PASSED ==\n");
} else {
TEST_LOGGER("== Homework 1: FAILED ==\n");
}
return 0;
}
```
##### **1.2. Test Suite**
Iterates all 256 uf8 values to check:
* Round-trip correctness: uf8_encode(uf8_decode(x)) == x
* Monotonicity: decoded values increase with uf8 values.
#### 2. Compilation and Execution
##### 2.1. Compilation Flags
To compile the .c file, I first tried to use the following command and keep the default Makefile in rv32emu.
```
cd rv32emu/my_hw2/system/playground
make clean
make run
```
There are just RV32I instructions that can be used so the I make sure the Makefile has the -march=rv32izicsr or -march=rv32i_zicsr flags.
```
Makefile
ARCH = -march=rv32izicsr
LINKER_SCRIPT = linker.ld
EMU ?= ../../../build/rv32emu
AFLAGS = -g $(ARCH)
CFLAGS = -g -march=rv32i_zicsr
LDFLAGS = -T $(LINKER_SCRIPT)
EXEC = test.elf
CC = $(CROSS_COMPILE)gcc
AS = $(CROSS_COMPILE)as
LD = $(CROSS_COMPILE)ld
OBJDUMP = $(CROSS_COMPILE)objdump
OBJS = start.o main.o
$(EXEC): $(OBJS) $(LINKER_SCRIPT)
$(LD) $(LDFLAGS) -o $@ $(OBJS)
```
#### 3. Result:
- All 256 uf8 values decoded and re-encoded correctly
- Values increase monotonically
```
=== Start Test Suite for Homework 1: UF8 ===
== Homework 1: All tests PASSED ==
Cycles: 70324
Instructions: 70324
```
## Adapt Problem A Quiz 2
#### 1. Overview:
This problem implements the Tower of Hanoi puzzle for 3 disks in RV32I assembly, using an iterative approach rather than recursion. Gray codes are used to determine which disk to move at each step.
**Key concepts**
* Tower of Hanoi Rules
* Move all disks from a source peg to a target peg.
* Only one disk may be moved at a time.
* A larger disk cannot be placed on top of a smaller disk.
* Gray Code Connection
* Gray codes are sequences where only one bit changes at a time between consecutive numbers.
* Assign each disk a bit:
```
Smallest disk → LSB (bit 0)
Next largest → bit 1
Largest → MSB (bit 2)
```
* The bit that flips indicates which disk moves.
* Iterative Algorithm Using Gray Codes
* Generate Gray code for move numbers: ```Gray(n) = n XOR (n >> 1)```
* Determine which disk moves by XORing the current Gray code with the previous one.
* Special rule for the smallest disk (disk A): it moves in a fixed cyclic pattern through the pegs.
* Other disks move based on the difference in Gray code values.
* Disk Position Tracking
* Each disk’s current peg is stored in memory.
* After computing the next move, the program updates the disk’s peg.
* Loop Structure
* A counter tracks the number of moves.
* Loop terminates after ```2^3 = 8``` moves for 3 disks.
* Each iteration calculates the Gray code, determines the disk to move, computes the destination peg, updates disk positions, and prints the move.
#### 2. Adaptation
> Original Tower of Hanoi Ripes code from the Quiz: [Link](https://hackmd.io/@sysprog/arch2025-quiz2-sol#Problem-A)
The original code are implemented for simulation in Ripes, but it doesn't work for the rv32emu bare-metal environment so I rewrite the Tower of Hanoi assembly code as follow:
```asm=
.text
.globl hanoi_asm
hanoi_asm:
addi x2, x2, -56
sw x8, 0(x2)
sw x9, 4(x2)
sw x18, 8(x2)
sw x19, 12(x2)
sw x20, 16(x2)
sw s2, 20(x2)
sw s3, 24(x2)
sw s4, 28(x2)
sw s5, 32(x2)
sw s6, 36(x2)
sw s7, 40(x2)
# 3 disk positions stored at offsets 44,48,52
sw x0, 44(x2)
sw x0, 48(x2)
sw x0, 52(x2)
li x8, 1 # iteration counter
game_loop:
# stop after 7 moves for 3 disks (2^3 - 1 = 7)
addi x5, x0, 8
beq x8, x5, finish_game
# Gray code n: g(n) = n XOR (n >> 1)
srli x5, x8, 1
xor x6, x8, x5
# Gray code n-1: g(n-1) = (n-1) XOR ((n-1)>>1)
addi x7, x8, -1
srli x28, x7, 1
xor x7, x7, x28
# changed bit = g(n) XOR g(n-1)
xor x5, x6, x7
# determine disk number
addi x9, x0, 0
andi x6, x5, 1
bne x6, x0, disk_found
addi x9, x0, 1
andi x6, x5, 2
bne x6, x0, disk_found
addi x9, x0, 2
disk_found:
# base position entry = 44 + disk*4
slli x5, x9, 2
addi x5, x5, 44
add x5, x2, x5
lw x18, 0(x5) # x18 = old source peg
# compute destination peg
bne x9, x0, handle_large
# disk 0 moves +2 mod 3
addi x19, x18, 2
li x6, 3
blt x19, x6, display_move
sub x19, x19, x6
jal x0, display_move
handle_large:
lw x6, 44(x2) # pos of largest disk
li x19, 3
sub x19, x19, x18
sub x19, x19, x6
display_move:
# x9 = disk index
# x18 = source peg index
# x19 = destination peg index
# convert peg index → ASCII ('A'+index)
addi t2, x18, 65 # source char
addi t3, x19, 65 # dest char
li a0, 1
la a1, str1
li a2, 10
li a7, 64
ecall
addi t0, x9, 1 # disk number 1..3
addi t0, t0, 48 # '0' + number
la t1, char_buffer
sb t0, 0(t1)
li a0, 1
la a1, char_buffer
li a2, 1
li a7, 64
ecall
li a0, 1
la a1, str2
li a2, 6
li a7, 64
ecall
la t1, char_buffer
sb t2, 0(t1)
li a0, 1
la a1, char_buffer
li a2, 1
li a7, 64
ecall
li a0, 1
la a1, str3
li a2, 4
li a7, 64
ecall
la t1, char_buffer
sb t3, 0(t1)
li a0, 1
la a1, char_buffer
li a2, 1
li a7, 64
ecall
li t0, 10
la t1, char_buffer
sb t0, 0(t1)
li a0, 1
la a1, char_buffer
li a2, 1
li a7, 64
ecall
# update disk position table
slli x5, x9, 2
addi x5, x5, 44
add x5, x2, x5
sw x19, 0(x5)
addi x8, x8, 1
jal x0, game_loop
finish_game:
lw x8, 0(x2)
lw x9, 4(x2)
lw x18, 8(x2)
lw x19, 12(x2)
lw x20, 16(x2)
lw s2, 20(x2)
lw s3, 24(x2)
lw s4, 28(x2)
lw s5, 32(x2)
lw s6, 36(x2)
lw s7, 40(x2)
addi x2, x2, 56
ret
.data
str1: .asciz "Move Disk "
str2: .asciz " from "
str3: .asciz " to "
char_buffer: .space 1
```
**Run in rv32emu bare-metal environment**
Update the main.c to compile the hanoi.s and also I add the get_cycles() and get_instret() to observe the performance.
```c=
extern uint64_t get_cycles(void);
extern uint64_t get_instret(void);
extern void hanoi_asm(void);
//=======================TOWER OF HANOI=======================
static bool test_hanoi_assembly(void) {
// Call the assembly function
// It will print the moves and return
hanoi_asm();
// If we reach here, the function completed
return true;
}
int main(void) {
uint64_t start_cycles, end_cycles, cycles_elapsed;
uint64_t start_instret, end_instret, instret_elapsed;
TEST_LOGGER("=== Start Test Suite for Quiz 2 Problem A: Tower of Hanoi ===\n");
start_cycles = get_cycles();
start_instret = get_instret();
bool all_passed_hanoi = test_hanoi_assembly();
end_cycles = get_cycles();
end_instret = get_instret();
cycles_elapsed = end_cycles - start_cycles;
instret_elapsed = end_instret - start_instret;
if (all_passed_hanoi) {
TEST_LOGGER("== Tower of Hanoi: All tests PASSED ==\n");
} else {
TEST_LOGGER("== Tower of Hanoi: FAILED ==\n");
}
TEST_LOGGER("Cycles: ");
print_dec((unsigned long) cycles_elapsed);
TEST_LOGGER("Instructions: ");
print_dec((unsigned long) instret_elapsed);
return 0;
}
```
Result after compile:

## Adapt Problem C Quiz 3
#### 1. Overview:
Implement a fast reciprocal square root function (rsqrt) that computes an approximation of $1/\sqrt{x}$ optimized for RISC-V RV32I architecture without hardware floating-point unit (FPU) or multiplication/division extensions (M-extension).
#### 2. Mathematical Background
The goal is to compute: $rsqrt(x) = 1/\sqrt{x}$
This operation is fundamental in computer graphics, physics simulations, and signal processing, particularly for:
* Vector normalization: $v̂ = v / |v| = v * (1/\sqrt{x² + y² + z²})$
* Distance calculations: distance = $\sqrt{x² + y² + z²}$
* Lighting computations: Normal vector calculations
**Key advantage:** Avoids expensive division by converting $x /\sqrt{y}$ into $x * (1/\sqrt{x})$, where multiplication is significantly faster than division on most architectures.
#### 3. Algorithm Components
The fast rsqrt implementation combines several techniques:
* Fixed-Point Representation (Q0.32 Format)
Since RV32I lacks hardware floating-point support, we use 16-bit fixed-point arithmetic:
* Format: Integer × 2^16 (scaled by 65536)
* Example: 3.5 → 3.5 × 65536 = 229376 (integer)
* Range: $[0, 1)$ for Q0.32, or $[0, 65536)$ for our 16-bit fractional format
* Operations: All arithmetic done on integers, then scaled appropriately
* Lookup Table
A 32-entry table provides initial approximations:
```c=
static const uint32_t rsqrt_table[32] = {
65536, 46341, 32768, 23170, 16384, // 2^0 to 2^4
11585, 8192, 5793, 4096, 2896, // 2^5 to 2^9
2048, 1448, 1024, 724, 512, // 2^10 to 2^14
362, 256, 181, 128, 90, // 2^15 to 2^19
64, 45, 32, 23, 16, // 2^20 to 2^24
11, 8, 6, 4, 3, // 2^25 to 2^29
2, 1 // 2^30 to 2^31
};
```
* Linear Interpolation
Lookup table alone is insufficient (only 32 discrete values). We interpolate between table entries for better accuracy.
Algorithm:
```c=
// Linear interpolation: y = y_base + (y_next - y_base) × fraction
uint32_t delta = y_base - y_next;
uint32_t adjustment = (delta × fraction) >> 16;
uint32_t y_initial = y_base - adjustment;
//Key optimization: Instead of expensive division (x - 2^exp) / 2^exp, use bit shift: >> exp
```
#### 4. Adaptation
> [**Source code**](https://github.com/scarlett910/ca2025-hw2/blob/main/rsqrt_not_optimized.c)
**Run in rv32emu bare-metal environment**
Update the main.c to compile the ```rsqrt.c```
```c=
#include "rsqrt.h"
/* Provide __udivsi3 for GCC - used by rsqrt.c */
uint32_t __udivsi3(uint32_t dividend, uint32_t divisor)
{
return (uint32_t)udiv(dividend, divisor);
}
void print_dec_inline(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf);
*--p = '\0';
if (val == 0) {
*--p = '0';
} else {
while (val > 0) {
*--p = '0' + umod(val, 10);
val = udiv(val, 10);
}
}
printstr(p, (buf + sizeof(buf) - 1 - p)); // no newline
}
int main(void) {
uint64_t start_cycles, end_cycles, cycles_elapsed;
uint64_t start_instret, end_instret, instret_elapsed;
uint64_t start_ticks, end_ticks, ticks_elapsed;
TEST_LOGGER("\n=== Start Test Suite for Quiz 3 Problem C: Fast rsqrt ===\n");
// Test rsqrt accuracy
start_cycles = get_cycles();
start_instret = get_instret();
start_ticks = getticks();
bool rsqrt_passed = test_rsqrt_accuracy();
end_cycles = get_cycles();
end_instret = get_instret();
end_ticks = getticks();
cycles_elapsed = end_cycles - start_cycles;
instret_elapsed = end_instret - start_instret;
ticks_elapsed = end_ticks - start_ticks;
if (rsqrt_passed) {
TEST_LOGGER("== rsqrt accuracy: PASSED ==\n");
} else {
TEST_LOGGER("== rsqrt accuracy: FAILED ==\n");
}
bool distance_passed = test_fast_distance_3d();
if (distance_passed) {
TEST_LOGGER("== fast_distance_3d: PASSED ==\n");
} else {
TEST_LOGGER("== fast_distance_3d: FAILED ==\n");
}
TEST_LOGGER("Cycles (get_cycles): ");
print_dec((unsigned long) cycles_elapsed);
TEST_LOGGER("Cycles (getticks): ");
print_dec((unsigned long) ticks_elapsed);
TEST_LOGGER("Instructions: ");
print_dec((unsigned long) instret_elapsed);
TEST_LOGGER("\n=== All Tests Completed ===\n");
return 0;
}
```
Result after compile:

## Optimization of the program
To observe the performance changes according to each compilation options, I disassemble the ELF files produced by the C compiler with 3 options: -O0 (no optimization), -Ofast (optimized for speed) to -Os (optimized for size).
Create the ```all_multi``` in Makefile to compile different compiled ELF files:
```
Makefile
O_LEVELS = Ofast Os
all_multi: $(O_LEVELS:%=test_%.elf)
test_%.elf:
$(MAKE) clean
$(MAKE) CFLAGS="-g -$* -march=rv32i_zicsr"
mv test.elf test_$*.elf
```
#### 1. Difference observation
I disassembled the ELF files generated by the C compiler and they give different value as follow:
```
riscv-none-elf-objdump -d test.elf
riscv-none-elf-objdump -d test_Ofast.elf
riscv-none-elf-objdump -d test_Os.elf
```
The results:
```
test.elf: file format elf32-littleriscv
Disassembly of section .text:
00010000 <_start>:
10000: 00004117 auipc sp,0x4
10004: 8b010113 addi sp,sp,-1872 # 138b0 <__stack_top>
10008: 00003297 auipc t0,0x3
1000c: 8a828293 addi t0,t0,-1880 # 128b0 <__bss_end>
10010: 00003317 auipc t1,0x3
10014: 8a030313 addi t1,t1,-1888 # 128b0
...
```
```
test_Ofast.elf: file format elf32-littleriscv
Disassembly of section .text:
00010000 <_start>:
10000: 00003117 auipc sp,0x3
10004: c7010113 addi sp,sp,-912 # 12c70 <__stack_top>
10008: 00002297 auipc t0,0x2
1000c: c6028293 addi t0,t0,-928 # 11c68 <__bss_end>
10010: 00002317 auipc t1,0x2
10014: c5830313 addi t1,t1,-936 # 11c68
...
```
```
test_Os.elf: file format elf32-littleriscv
Disassembly of section .text:
00010000 <_start>:
10000: 00002117 auipc sp,0x2
10004: 0a010113 addi sp,sp,160 # 120a0 <__stack_top>
10008: 00001297 auipc t0,0x1
1000c: 09028293 addi t0,t0,144 # 11098 <__bss_end>
10010: 00001317 auipc t1,0x1
10014: 08830313 addi t1,t1,136 # 11098 <__bss_end>
...
```
After that, I run these elf files to observe the performance differences.
Ofast: 
Os:

O0:

As what we can observe, the cycles and instruction count for Ofast and Os are not much different from each other, I think this can come from some mistakes I made when I set up the compilation but for the non-optimized ELF file, we can see that the rsqrt program's cycles count for is a lot bigger than the optimized ones (94868 > 89534)
#### 2. Applying optimization methods:
Due to that phenomenon, I have handwritten and optimized again by using loop unrolling for clz() and peephole optimization for newton_step() in the the origninal rsqrt.c
Before optimization:
```c=
//clz() using looping
static inline unsigned clz(uint32_t x) {
if (x == 0) return 32;
unsigned n = 0;
uint32_t bit = 0x80000000;
while ((x & bit) == 0) {
n++;
bit >>= 1;
}
return n;
}
//rsqrt() call the newton_step() causing overhead
static inline uint32_t newton_step(uint32_t x, uint32_t y) {
uint64_t y64 = y;
uint64_t y2 = (y64 * y64) >> 16;
uint64_t xy2 = x * y2;
return (uint32_t)((y64 * ((3U << 16) - xy2)) >> 17);
}
/* RSQRT vá»›i newton_step() */
uint32_t rsqrt(uint32_t x) {
if (x == 0) return 0xFFFFFFFF;
uint32_t exp = 31 - clz(x);
uint32_t y_base = rsqrt_table[exp];
uint32_t y_next = (exp == 31) ? 1 : rsqrt_table[exp + 1];
uint32_t delta = y_base - y_next;
uint64_t frac_num_scaled = (uint64_t)(x - (1U << exp)) << 16;
uint32_t frac = (uint32_t)(frac_num_scaled >> exp);
uint32_t y = y_base - (uint32_t)(((uint64_t)delta * frac) >> 16);
y = newton_step(x, y);
y = newton_step(x, y);
return y;
}
```
Applying the optimization methods:
```c=
// loop unrolling for clz - newton_step
static inline unsigned clz(uint32_t x) {
if (x == 0) return 32;
unsigned n = 0;
if ((x & 0xFFFF0000) == 0) { n += 16; x <<= 16; }
if ((x & 0xFF000000) == 0) { n += 8; x <<= 8; }
if ((x & 0xF0000000) == 0) { n += 4; x <<= 4; }
if ((x & 0xC0000000) == 0) { n += 2; x <<= 2; }
if ((x & 0x80000000) == 0) { n += 1; }
return n;
}
// rsqrt with peephole optimization - Inline Newton iterations
uint32_t rsqrt(uint32_t x) {
if (x == 0) return 0xFFFFFFFF;
// Tìm exponent (MSB position)
uint32_t exp = 31 - clz(x);
// Lookup table
uint32_t y_base = rsqrt_table[exp];
uint32_t y_next = (exp == 31) ? 1 : rsqrt_table[exp + 1];
// Linear interpolation
uint32_t delta = y_base - y_next;
uint64_t frac_num_scaled = (uint64_t)(x - (1U << exp)) << 16;
uint32_t frac = (uint32_t)(frac_num_scaled >> exp);
uint32_t interp = (uint32_t)(((uint64_t)delta * frac) >> 16);
uint32_t y = y_base - interp;
// Newton iteration 1
uint32_t y2 = (uint32_t)(((uint64_t)y * y) >> 16);
uint32_t xy2 = (uint32_t)((uint64_t)x * y2);
y = (uint32_t)(((uint64_t)y * ((3U << 16) - xy2)) >> 17);
// Newton iteration 2
y2 = (uint32_t)(((uint64_t)y * y) >> 16);
xy2 = (uint32_t)((uint64_t)x * y2);
y = (uint32_t)(((uint64_t)y * ((3U << 16) - xy2)) >> 17);
return y;
}
```
The result I got after loop unrolling and peephole optimization for rsqrt.c:

#### 3. Optimazation for Tower of Hanoi:
The non-optimized compiled result return with the cycle count of 689, I tried to apply the "Dead code elimination" method to removes instructions that have no effect on the program's output. I remove register save/restore operations for registers that are never used in the function.
Original Code (11 registers saved, 56-byte stack):
```asm=
hanoi_asm:
addi x2, x2, -56 # 56-byte stack frame
sw x8, 0(x2)
sw x9, 4(x2)
sw x18, 8(x2)
sw x19, 12(x2)
sw x20, 16(x2) # dead code - x20 never used
sw s2, 20(x2) # dead code - s2 never used
sw s3, 24(x2) # dead code - s3 never used
sw s4, 28(x2) # dead code - s4 never used
sw s5, 32(x2) # dead code - s5 never used
sw s6, 36(x2) # dead code - s6 never used
sw s7, 40(x2) # dead code - s7 never used
```
Optimized Code (4 registers saved, 32-byte stack):
```asm=
hanoi_asm:
addi x2, x2, -32 # 32-byte stack frame (reduced from 56)
sw x8, 0(x2) # kept - used as iteration counter
sw x9, 4(x2) # kept - used for disk number
sw x18, 8(x2) # kept - used for source peg
sw x19, 12(x2) # kept - used for dest peg
# Removed 7 unnecessary sw instructions
```
I also apply this again for the function finish_game to remove 7 unnecessary lw instructions
Result after optimize hanoi.s:

The cycle count and instruction count reduce from 689 to 652.
## Reference
[Quiz2 of Computer Architecture (2025 Fall)](https://hackmd.io/@sysprog/arch2025-quiz2-sol)
[Quiz3 of Computer Architecture (2025 Fall)](https://hackmd.io/@sysprog/arch2025-quiz3-sol)
[Lab 2 RISC-V Instruction Set Simulator and System Emulator](https://hackmd.io/@sysprog/Sko2Ja5pel#Inspecting-Binaries-with-GNU-Toolchain)