# Assignment2: RISC-V Toolchain
contributed by < [`clare8151214`](https://github.com/clare8151214) >
## Install rv32ima toolchain & rv32emu
Follow the instructions in [this note](https://hackmd.io/@sysprog/Sko2Ja5pel).
Validate that the installation succeeded by executing
```
riscv-none-elf-gcc -v
```
and confirming that the toolchain version information is printed to the terminal.

### Run Tests

### tests/system/playground

## Pick one complete program (with test suite)
The following program is taken from Assignment 1:
- [Original assembly code](https://github.com/clare8151214/ca2025-quizzes/blob/main/q1-uf8.s)
- [Original C code](https://github.com/clare8151214/ca2025-quizzes/blob/main/q1-uf8.c)
### The Hand-Written Assembly
The hand-written assembly looks like this:
```asm
.text
.globl uf8_decode
.globl uf8_encode

uf8_decode:
    mv   t5, a0              # t5 = fl
    andi t0, t5, 0x0f        # mantissa
    srli t1, t5, 4           # exponent
    li   t2, 0x7FFF
    li   t3, 15
    sub  t3, t3, t1          # 15 - exponent
    srl  t4, t2, t3          # 0x7FFF >> (15 - exponent)
    slli t4, t4, 4           # << 4
    sll  a0, t0, t1          # mantissa << exponent
    add  a0, a0, t4          # + offset
    ret

uf8_encode:
    mv   t0, a0              # t0 = original value
    li   t6, 15
    bleu t0, t6, encode_exp_0_fast_path
    li   t6, 47
    bleu t0, t6, encode_exp_1_fast_path
    li   t6, 111
    bleu t0, t6, encode_exp_2_fast_path
    li   t6, 239
    bleu t0, t6, encode_exp_3_fast_path
    j    encode_generic_path

encode_exp_0_fast_path:      # value <= 15
    andi a0, t0, 0xFF
    ret
encode_exp_1_fast_path:      # 16–47
    addi t1, t0, -16
    srli t1, t1, 1
    ori  a0, t1, 0x10        # exponent = 1
    ret
encode_exp_2_fast_path:      # 48–111
    addi t1, t0, -48
    srli t1, t1, 2
    ori  a0, t1, 0x20        # exponent = 2
    ret
encode_exp_3_fast_path:      # 112–239
    addi t1, t0, -112
    srli t1, t1, 3
    ori  a0, t1, 0x30        # exponent = 3
    ret

encode_generic_path:
    # --- Inlined clz ---
    mv   a5, t0
    li   t1, 32
    li   t2, 16
clz_inline_loop:
    srl  t3, a5, t2
    beq  t3, x0, clz_inline_skip
    sub  t1, t1, t2
    mv   a5, t3
clz_inline_skip:
    srli t2, t2, 1
    bne  x0, t2, clz_inline_loop
    sub  t1, t1, a5          # t1 = lz
    li   t2, 31
    sub  t2, t2, t1
    li   t3, 0
    li   t4, 0
    li   t6, 5
    blt  t2, t6, exp_est_done
    li   t6, 4
    sub  t3, t2, t6
    li   t6, 15
    bgeu t3, t6, cap_to_15
    j    exp_est_loop_init
cap_to_15:
    li   t3, 15
exp_est_loop_init:
    beqz t3, exp_est_done
    li   t4, 0
    li   t5, 0
exp_make_ovl:
    bgeu t5, t3, exp_make_ovl_done
    slli t4, t4, 1
    addi t4, t4, 16
    addi t5, t5, 1
    j    exp_make_ovl
exp_make_ovl_done:
    li   t5, 0
exp_back_check:
    beqz t3, exp_est_done
    bltu t0, t4, exp_back_step
    j    exp_est_done
exp_back_step:
    addi t4, t4, -16
    srli t4, t4, 1
    addi t3, t3, -1
    j    exp_back_check
exp_est_done:
adj_up_loop:
    li   t6, 15
    beq  t3, t6, adj_done
    slli t5, t4, 1
    addi t5, t5, 16
    bltu t0, t5, adj_done
    mv   t4, t5
    addi t3, t3, 1
    j    adj_up_loop
adj_done:
    sub  t6, t0, t4
    srl  t6, t6, t3
    slli t3, t3, 4
    or   a0, t3, t6
    andi a0, a0, 0xFF
    ret
```
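For reference, the search performed by the generic path can be written in C as follows. This is an illustrative sketch (`uf8_ref_encode` is a name introduced here, not part of the test suite): the assembly first estimates the exponent with the inlined clz and then adjusts, but growing the exponent from zero reaches the same stopping condition.
```c
#include <stdint.h>

/* Mirror of adj_up_loop: grow the exponent while the next offset
 * (offset * 2 + 16) still fits under the input value. */
static uint32_t uf8_ref_encode(uint32_t value)
{
    uint32_t exponent = 0;
    uint32_t offset = 0;
    while (exponent < 15 && value >= (offset << 1) + 16) {
        offset = (offset << 1) + 16;   /* slli t5, t4, 1; addi t5, t5, 16 */
        exponent++;
    }
    /* mantissa = (value - offset) >> exponent; pack as [exp:4 | mant:4] */
    return ((exponent << 4) | ((value - offset) >> exponent)) & 0xFF;
}
```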
I added a new test case in `main.c` to validate the UF8 encode/decode implementation.
```c
static uint32_t uf8_ref_decode(uint32_t fl)
{
    uint32_t mantissa = fl & 0x0F;
    uint32_t exponent = fl >> 4;
    if (exponent > 15)
        exponent = 15;
    uint32_t offset = 0x7FFFu >> (15u - exponent);
    offset <<= 4;
    return (mantissa << exponent) + offset;
}

static void test_uf8_roundtrip(void)
{
    TEST_LOGGER("Test: uf8_encode/uf8_decode round-trip\n");
    enum {
        UF8_FAIL_NONE,
        UF8_FAIL_DECODE,
        UF8_FAIL_ROUNDTRIP,
        UF8_FAIL_MONOTONIC,
    } reason = UF8_FAIL_NONE;
    uint32_t fail_fl = 0;
    uint32_t fail_expected = 0;
    uint32_t fail_observed = 0;
    uint32_t fail_value = 0;
    uint32_t prev_value = 0;
    bool first = true;

    for (uint32_t fl = 0; fl < 256; fl++) {
        uint32_t decoded = uf8_decode(fl);
        uint32_t expected = uf8_ref_decode(fl);
        if (decoded != expected) {
            reason = UF8_FAIL_DECODE;
            fail_fl = fl;
            fail_expected = expected;
            fail_observed = decoded;
            break;
        }
        uint32_t encoded = uf8_encode(decoded);
        if (encoded != fl) {
            reason = UF8_FAIL_ROUNDTRIP;
            fail_fl = fl;
            fail_expected = fl;
            fail_observed = encoded;
            fail_value = decoded;
            break;
        }
        if (!first && decoded <= prev_value) {
            reason = UF8_FAIL_MONOTONIC;
            fail_fl = fl;
            fail_expected = prev_value;
            fail_observed = decoded;
            break;
        }
        prev_value = decoded;
        first = false;
    }

    if (reason == UF8_FAIL_NONE) {
        TEST_LOGGER(" UF8 round-trip/monotonicity: PASSED\n");
        return;
    }
    TEST_LOGGER(" UF8 round-trip/monotonicity: FAILED\n");
    switch (reason) {
    case UF8_FAIL_DECODE:
        TEST_LOGGER(" Decode mismatch at FL index ");
        print_dec((unsigned long) fail_fl);
        TEST_LOGGER(" Expected decoded value: ");
        print_dec((unsigned long) fail_expected);
        TEST_LOGGER(" Observed decoded value: ");
        print_dec((unsigned long) fail_observed);
        break;
    case UF8_FAIL_ROUNDTRIP:
        TEST_LOGGER(" Round-trip mismatch at decoded value ");
        print_dec((unsigned long) fail_value);
        TEST_LOGGER(" Expected FL code: ");
        print_dec((unsigned long) fail_expected);
        TEST_LOGGER(" Observed FL code: ");
        print_dec((unsigned long) fail_observed);
        TEST_LOGGER(" Source FL index: ");
        print_dec((unsigned long) fail_fl);
        break;
    case UF8_FAIL_MONOTONIC:
        TEST_LOGGER(" Non-monotonic decode at FL index ");
        print_dec((unsigned long) fail_fl);
        TEST_LOGGER(" Previous decoded value: ");
        print_dec((unsigned long) fail_expected);
        TEST_LOGGER(" Current decoded value: ");
        print_dec((unsigned long) fail_observed);
        break;
    default:
        break;
    }
}
```
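The test is then wired into the driver in `main.c` alongside the existing cases. A minimal hookup sketch (the driver shape is assumed; the real file already declares `uf8_encode`, `uf8_decode`, and the logging helpers):
```c
/* Hypothetical driver excerpt: each test is simply called in sequence. */
int main(void)
{
    /* ... existing test cases ... */
    test_uf8_roundtrip(); /* new: UF8 decode / round-trip / monotonicity */
    return 0;
}
```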
### Test Result

## Problem A from Quiz 2: Hanoi Tower on rv32emu
- [Assembly code](https://github.com/clare8151214/Assignment2/blob/main/hanoi.S)
- [C code](https://github.com/clare8151214/Assignment2/blob/main/hanoi.c)
### Analysis of Compiler Optimization Behavior
The execution results show a clear downward trend in both cycle count and instruction count as the optimization level increases. This behavior is consistent with how GCC applies different optimization strategies for RV32 targets.
### 1. -O0 (No Optimization) — 841 cycles
At `-O0`, the compiler performs almost no code transformations. As a result:
- Every C statement is translated almost literally into assembly.
- There is no constant folding, no dead-code elimination, and no loop optimization.
- Temporary variables are stored on the stack instead of in registers.
- Extra load/store operations dominate execution.

This produces the largest instruction count and the slowest runtime, serving as the baseline.
`gcc -O0`

### 2. -O2 (General Optimization) — 533 cycles
At `-O2`, GCC performs a broad set of optimizations aimed at balanced performance:
- Common-subexpression elimination
- Improved register allocation
- Constant propagation
- Strength reduction
- Elimination of unnecessary loads/stores

This significantly reduces the number of memory accesses and redundant operations. The drop from 841 → 533 cycles shows that the compiler removed a large amount of overhead introduced in the unoptimized code.
`gcc -O2`

### 3. -Os (Optimize for Size) — 527 cycles
`-Os` optimizes for code size, yet its performance is very close to `-O2`. This happens because:
- Smaller code often means fewer instructions executed.
- `-Os` uses `-O2` as a base but disables transformations that increase code size (e.g., aggressive inlining or loop unrolling).
- For this particular algorithm (Gray-code Hanoi), the working set is small enough that reducing code size indirectly improves performance.

Thus `-Os` is slightly more compact but performs nearly as well as `-O2`.
`gcc -Os`

### 4. -Ofast (Aggressive Optimization) — 348 cycles
`-Ofast` removes many of the conservative safety checks in standard optimizations:
- More aggressive inlining
- More loop optimizations
- Relaxed strict IEEE floating-point compliance
- Simplified conditional logic when provable

Because the Gray-code computation and disk movement are purely integer operations, GCC has the freedom to:
- Collapse branches
- Keep almost all variables in registers
- Remove redundant recomputation
- Optimize bitwise operations aggressively

This leads to the best result: 348 cycles, a reduction of almost 60% from `-O0`.
`gcc -Ofast`

### 5. Hand-Written Assembly Code
[Original code](https://github.com/clare8151214/Assignment2/commit/381c16c028245d21f8fa11d52ab12465c4bb92ae#diff-15361b24046d62b1981fafc9b37a754985b89aa54654ebb50409d132b3b86706)
**Estimated Performance: ~518 cycles**

### Optimization 1: Gray Code Calculation (Loop Reduction)
Without touching the stack or the s-registers yet, we can optimize the calculation logic within the existing program structure.
**Modification:** Initialize an extra register `prev_gray` (we use `t2`) before the loop. After `li t0, 1` and `li t1, 8`, add:
```asm
    li t0, 1                 # n = 1 (move index)
    li t1, 8                 # stop_n = 8
    li t2, 0                 # prev_gray = g(0) = 0
```
**Rewrite the loop's Gray code section:**
Replace the original redundant calculation:
```asm
    # Gray code: g(n)
    mv   t2, t0
    srli t3, t2, 1
    xor  t4, t2, t3          # t4 = gray(n)
    # Gray code: g(n-1)
    addi t2, t0, -1
    srli t3, t2, 1
    xor  t2, t2, t3          # t2 = gray(n-1)
    # changed bits = g(n) ^ g(n-1)
    xor  t5, t4, t2
    andi t5, t5, 3           # only low 2 bits matter
```
With this optimized version:
```asm
    # Gray code: g(n)
    srli t3, t0, 1
    xor  t4, t0, t3          # t4 = curr_gray
    # changed bits = curr ^ prev
    xor  t5, t4, t2
    mv   t2, t4              # update prev_gray (t2)
    andi t5, t5, 3           # only low 2 bits matter
```
**Result:** This saves three instructions per loop iteration (net of removing the `addi`/`srli`/`xor` sequence that recomputed `g(n-1)`). The logic remains valid because `prev_gray` is initialized to 0 (since `g(0) = 0`) and is updated to the current `g(n)` at the end of every iteration.
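For reference, here is the same incremental scheme in C (a sketch; the function name and loop shape are illustrative, mirroring the register comments rather than the actual `hanoi.c` code):
```c
#include <stdint.h>

/* Incremental Gray-code scan: keep g(n-1) in prev_gray instead of
 * recomputing it every iteration, as in the optimized loop above. */
void scan_gray_changes(uint32_t stop_n)
{
    uint32_t prev_gray = 0;                 /* g(0) = 0, like li t2, 0 */
    for (uint32_t n = 1; n < stop_n; n++) {
        uint32_t curr_gray = n ^ (n >> 1);  /* g(n) */
        uint32_t changed = (curr_gray ^ prev_gray) & 3; /* low 2 bits */
        prev_gray = curr_gray;              /* update for next iteration */
        /* for 3 disks: 1 -> disk 0, 2 -> disk 1, 0 -> disk 2 */
        (void) changed;
    }
}
```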
---
### Optimization 2: Removing the Stack Frame (Leaf Function Optimization)
We can completely eliminate the stack frame by using only caller-saved registers.
Rationale: Since this function returns `void`, takes only a pointer to the moves array, and is a leaf function (it calls no other functions), we can hold all state in `a0`–`a5` and `t0`–`t6`.
RISC-V ABI: `a0–a7` and `t0–t6` are caller-saved.
By avoiding `s0–s11`, we remove the need to save/restore registers or adjust the stack pointer (sp).
Proposed Register Mapping:
* State Variables (formerly s-regs):
* `a0`: move_ptr (was `s0`)
* `a1`: disk0_pos (was `s1`)
* `a2`: disk1_pos (was `s2`)
* `a3`: disk2_pos (was `s3`)
* `a4`: base_char = 'A' (was `s4`)
* `a5`: peg_count = 3 (was `s5`)
* Loop Temporaries:
* `t0`: n
* `t1`: stop_n
* `t2`: prev_gray
* `t3`: Temporary (from_peg)
* `t4`: Temporary (to_peg / gray)
* `t5`: changed_bits
* `t6`: disk_index (0/1/2)
New Function Initialization:
```asm
hanoi_generate_moves_asm:
    mv a1, x0                # disk0_pos = 0
    mv a2, x0                # disk1_pos = 0
    mv a3, x0                # disk2_pos = 0
    li a4, 65                # base_char = 'A'
    li a5, 3                 # peg_count = 3
    li t0, 1                 # n = 1
    li t1, 8                 # stop_n = 8
    li t2, 0                 # prev_gray = 0
```
Updated `write_move_entry`:
```asm
write_move_entry:
    add  t3, a4, t3          # from_char = 'A' + from_peg (t3 = from_peg)
    add  t4, a4, t4          # to_char   = 'A' + to_peg   (t4 = to_peg)
    addi t6, t6, 1           # disk_num = disk_index + 1
    sw   t6, 0(a0)           # moves[i].disk
    sb   t3, 4(a0)           # moves[i].from_peg
    sb   t4, 5(a0)           # moves[i].to_peg
    addi a0, a0, 8           # move_ptr += sizeof(hanoi_move_t)
    addi t0, t0, 1           # n++
    j    loop_generate_moves
```
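The store offsets and the `+8` stride imply a layout like the following for `hanoi_move_t` (inferred from the assembly; the actual definition lives in `hanoi.c`):
```c
#include <stdint.h>

/* Inferred layout: disk at offset 0 (sw), from_peg at offset 4 (sb),
 * to_peg at offset 5 (sb); 4-byte alignment pads sizeof to 8,
 * matching addi a0, a0, 8. */
typedef struct {
    int32_t disk;  /* sw t6, 0(a0) */
    char from_peg; /* sb t3, 4(a0) */
    char to_peg;   /* sb t4, 5(a0) */
} hanoi_move_t;
```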
**Note:** For the `select_disk0/1/2` blocks, simply replace `s1/s2/s3` with `a1/a2/a3` and `s5` with `a5`. The logic remains identical.
```asm
finish_generation:
    ret
```
**Final Cleanup:** The prologue and epilogue are now minimal. We no longer need the following setup (or the matching restores) for the `s` registers:
```asm
    addi sp, sp, -28
    sw ra, ...
    sw s0~s5
```
#### Result: 488 cycles
[code](https://github.com/clare8151214/Assignment2/blob/main/hanoi.S)

---
## Problem C from Quiz 3: rsqrt
Compiler optimization levels significantly affect both performance and code size on RISC-V RV32 targets.
Below is a comparison of `-O0`, `-O2`, `-Ofast`, and `-Os`, summarizing how each mode transforms code and influences cycle count, instruction count, and typical generated assembly patterns.
---
### 1. `-O0` — No Optimization
**Characteristics**
- Direct, literal translation of C → assembly.
- Each C expression becomes multiple load/store instructions.
- Almost all temporaries placed on the stack (poor register allocation).
- No constant folding, no inlining, no algebraic simplifications.
**Effects**
- **Slowest execution.**
- **Highest cycle and instruction count.**
- Excellent for debugging (one-to-one mapping to source code).
Ticks: 4412
Cycles: 4412
Instructions: 4412

---
### 2. `-O2` — High Optimization (Safe Transformations)
**Characteristics**
- Aggressive register allocation.
- Dead code elimination.
- Constant folding & strength reduction (`x*2 → x<<1`, etc.).
- Loop invariants moved out of loops.
- Basic inlining of small functions.
**Effects**
- **Major performance improvement** over `-O0`.
- Code size increases compared to `-Os` but remains moderate.
- Very predictable behavior; widely used for production.
Ticks: 1393
Cycles: 1393
Instructions: 1393

---
### 3. `-Ofast` — Maximum Performance (Unsafe but Fast)
**Characteristics**
- Everything from `-O3`, plus:
  - Ignores strict IEEE floating-point rules.
  - May reorder expressions assuming no NaNs/Infs.
  - May fold FP operations more aggressively.
  - Allows non-standard math optimizations.
**Effects**
- **Fastest execution**, especially for floating-point & math-heavy code.
- Largest code size among the group.
- Not suitable when strict numerical correctness or determinism is required.
Ticks: 1337
Cycles: 1337
Instructions: 1337

---
### 4. `-Os` — Optimize for Size
**Characteristics**
- Based on `-O2` but removes transformations that increase code size.
- Prefers shorter instruction sequences.
- Reduced inlining, avoids loop unrolling.
- More jump-based logic instead of instruction-heavy sequences.
**Effects**
- **Smallest code size**.
- Runtime performance typically between `-O1` and `-O2`.
- Ideal for embedded systems with limited flash/ROM.
Ticks: 1697
Cycles: 1697
Instructions: 1697

---
### 5. Hand-Written Assembly Code
**[Original code](https://github.com/clare8151214/Assignment2/commit/c8ecb7226f76e91ec059dd1799317d9e4207ebd7#diff-3e5237ed1e84b2cb820d563688c7eabc58d124daf5fdf531753708f44c25d064)**
Ticks: 1833
Cycles: 1833
Instructions: 1833

### Key Optimizations Applied
#### 1. **Fast Path for Zero Input**
**Savings: ~10 cycles**
**Before:**
```asm
fast_rsqrt_asm:
    addi sp, sp, -32
    sw   ra, 28(sp)
    # ... stack setup
    beqz a0, rsqrt_zero_input
rsqrt_zero_input:
    li   a0, 0
    j    rsqrt_return
```
**After:**
```asm
fast_rsqrt_asm:
    bnez a0, rsqrt_normal
    ret                      # Direct return for zero input
```
**Rationale:** Avoid the stack setup/teardown overhead for the zero-input case.
---
#### 2. **Reduce Stack Space**
**Savings: ~2 cycles**
**Before:**
```asm
    addi sp, sp, -32         # 6 saved registers
    sw   s4, 8(sp)
    # ...
```
**After:**
```asm
    addi sp, sp, -24         # Only 5 saved registers
    # Use temporary registers instead of s4
```
**Rationale:** Eliminate an unnecessary saved register by using temporaries instead.
---
#### 3. **Pre-compute Delta Value**
**Savings: ~1 cycle**
**Before:**
```asm
    lw  s3, 4(t1)
    j   do_interp
do_interp:
    sub s4, s2, s3           # Compute delta here
```
**After:**
```asm
    lw  s3, 4(t1)
    sub s3, s2, s3           # Compute delta immediately
    j   do_interp
```
**Rationale:** Reduce data dependency latency.
---
#### 4. **Inline mul64 - Frac Calculation**
**Savings: ~6 cycles**
**Before:**
```asm
    mv   a0, t1
    mv   a1, t2
    call mul64               # Function call overhead
```
**After:**
```asm
    mul   a0, t1, t2         # Inline multiplication
    mulhu a1, t1, t2
```
**Rationale:** Eliminate function call overhead (jal + ret + register moves).
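The substitution is sound because the `mul`/`mulhu` pair computes exactly the 64-bit product that `mul64` returns in `a0`/`a1`. In C terms (a sketch; `mul64_ref` is an illustrative name):
```c
#include <stdint.h>

/* What the inlined pair computes for 32-bit operands a and b:
 *   mul   rd, a, b  -> low 32 bits of the product
 *   mulhu rd, a, b  -> high 32 bits of the unsigned product */
static uint64_t mul64_ref(uint32_t a, uint32_t b)
{
    uint64_t product = (uint64_t) a * (uint64_t) b;
    uint32_t lo = (uint32_t) product;          /* mul   */
    uint32_t hi = (uint32_t) (product >> 32);  /* mulhu */
    return ((uint64_t) hi << 32) | lo;
}
```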
---
#### 5. **Inline mul64 - Delta × Frac**
**Savings: ~6 cycles**
**Before:**
```asm
    mv   a0, s4
    mv   a1, t0
    call mul64
```
**After:**
```asm
    mul   t0, s3, a0         # Direct inline computation
    mulhu t1, s3, a0
```
**Rationale:** Remove function call and unnecessary register moves.
---
#### 6. **Direct Computation of y² Lower 32 Bits**
**Savings: ~9 cycles** ⭐ **Biggest single optimization**
**Before:**
```asm
    mv   a0, s2
    mv   a1, s2
    call mul64               # Full 64-bit multiply
    mv   s3, a0              # Only use lower 32 bits
```
**After:**
```asm
    mul s3, s2, s2           # Only compute lower 32 bits
```
**Rationale:** The algorithm only needs the lower 32 bits of y², so a full 64-bit multiply and the high half (`mulhu`) are unnecessary.
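In C, the same observation is the difference between a full 64-bit product and a plain 32-bit multiply, which wraps modulo 2^32 (sketch; `square_lo` is an illustrative name):
```c
#include <stdint.h>

/* Only the low 32 bits of y*y are consumed downstream, so one
 * 32-bit multiply is enough. */
static uint32_t square_lo(uint32_t y)
{
    return y * y;   /* matches: mul s3, s2, s2 */
}
```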
---
#### 7. **Inline mul64 - x × y²**
**Savings: ~6 cycles**
**Before:**
```asm
    mv   a0, s0
    mv   a1, s3
    call mul64
```
**After:**
```asm
    mul   a0, s0, s3
    mulhu a1, s0, s3
```
**Rationale:** Inline the multiplication.
---
#### 8. **Optimize Constant Loading**
**Savings: ~1 cycle**
**Before:**
```asm
    li   t0, 3
    slli t0, t0, 16          # Two instructions
```
**After:**
```asm
    li t1, 196608            # Direct load of 3 << 16
```
**Rationale:** Pre-compute constant to eliminate shift instruction.
---
#### 9. **Inline mul64 - Final Newton-Raphson Step**
**Savings: ~6 cycles**
**Before:**
```asm
    mv   a0, s2
    mv   a1, t0
    call mul64
```
**After:**
```asm
    mul   a0, s2, t1
    mulhu a1, s2, t1
```
**Rationale:** Inline the final multiplication.
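For context, this multiply evaluates the standard Newton-Raphson update for $1/\sqrt{x}$, which is also where the constant 3 from Optimization 8 (scaled to Q16 as `3 << 16`) comes from:

$$y_{n+1} = \tfrac{1}{2}\, y_n \left( 3 - x\, y_n^{2} \right)$$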
#### 10. **Inline mul64 in fast_distance_3d**
**Savings: ~6 cycles**
**Before:**
```asm
    mv   a0, s0
    mv   a1, s1
    call mul64
```
**After:**
```asm
    mul   a0, s0, s1
    mulhu a1, s0, s1
```
**Rationale:** Remove function call in distance calculation.
---
### Expected Improvement
**~30–35% reduction in cycle count was estimated; the measured run below (1409 vs. 1833 cycles) works out to a ~23% reduction.**
The most significant optimization is **#6 (Direct y² computation)**, which recognizes that the algorithm only needs the lower 32 bits of the y² result, allowing us to skip the full 64-bit multiplication and the function-call overhead entirely.
Ticks: 1409
Cycles: 1409
Instructions: 1409

## Summary Table
| Implementation | Execution Speed | Code Size | Safety / Accuracy | Notes |
| -------------------- | --------------- | --------- | ------------------ | ----------------------------- |
| **-O0** | Slowest | Largest | Fully safe | Debugging only |
| **-O2** | Fast | Medium | Safe | Best general-purpose choice |
| **-Ofast** | Fastest | Largest | Not IEEE-safe | Aggressive optimizations |
| **-Os** | Medium | Smallest | Safe | Size-constrained systems |
| **Hand-written ASM** | Fast | Small | Safe (fixed-point) | Optimized for embedded RV32I |