Assignment 2: RISC-V Toolchain

# Assignment 2: RISC-V Toolchain #### contributed by <`bbchen`> # [Uf8](https://hackmd.io/@bbchen/arch2025-HW1) You can find the detailed problem description and testing method in my [HW1](https://hackmd.io/@bbchen/arch2025-HW1). After building the system mode. The directory contains a comprehensive bare-metal test suite: - Makefile - uf8 encode/decode in RISC-V assembly - uf8 encode/decode function code in c - Performance counter functions for RISC-V - Startup code in assembly - Linker scripts --- ### Makefile ```makefile include ../../mk/toolchain.mk ARCH = -march=rv32i_zicsr -mabi=ilp32 LINKER_SCRIPT = linker.ld EMU ?= ../../build/rv32emu AFLAGS = -g $(ARCH) LDFLAGS = -T $(LINKER_SCRIPT) ifdef FAST CFLAGS = -g -march=rv32i_zicsr -mabi=ilp32 -Ofast else ifdef SIZE CFLAGS = -g -march=rv32i_zicsr -mabi=ilp32 -Os else CFLAGS = -g -march=rv32i_zicsr -mabi=ilp32 -O0 endif ifneq ($(DEFINES),) CFLAGS += $(DEFINES) endif EXEC = test.elf CC = $(CROSS_COMPILE)gcc AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld OBJDUMP = $(CROSS_COMPILE)objdump OBJS = start.o perfcounter.o main.o uf8.o .PHONY: all run dump clean all: $(EXEC) $(EXEC): $(OBJS) $(LINKER_SCRIPT) $(LD) $(LDFLAGS) -o $@ $(OBJS) %.o: %.S $(AS) $(AFLAGS) $< -o $@ %.o: %.c $(CC) $(CFLAGS) $< -o $@ -c run: $(EXEC) @test -f $(EMU) || (echo "Error: $(EMU) not found" && exit 1) @grep -q "ENABLE_ELF_LOADER=1" ../../build/.config || (echo "Error: ENABLE_ELF_LOADER=1 not set" && exit 1) @grep -q "ENABLE_SYSTEM=1" ../../build/.config || (echo "Error: ENABLE_SYSTEM=1 not set" && exit 1) $(EMU) $< dump: $(EXEC) $(OBJDUMP) -Ds $< | less clean: rm -f $(EXEC) $(OBJS) ``` This `Makefile` builds a `bare-metal` RV32I program . It sets the RISC-V cross-toolchain and arch flags, compiles `start.S`, `perfcounter.S`, `main.c`, and `uf8.S` into objects, then links them with `linker.ld` to produce `test.elf`. The run target sanity-checks that rv32emu exists and that ENABLE_ELF_LOADER=1 and ENABLE_SYSTEM=1 are set, then runs the ELF in the emulator. --- ### main.c :::spoiler c code ```c #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> static inline void write_str_syscall(int fd, const char *ptr, long len) { register long a0 asm("a0") = fd; register const char *a1 asm("a1") = ptr; register long a2 asm("a2") = len; register long a7 asm("a7") = 64; // system_write asm volatile("ecall" : "+r"(a0) : "r"(a1), "r"(a2), "r"(a7) : "memory", "cc"); } #define printstr(ptr, length) write_str_syscall(1, (ptr), (length)) #define TEST_OUTPUT(msg, length) printstr((msg), (length)) #define TEST_LOGGER(msg) \ do { \ static const char _s_[] = msg; \ TEST_OUTPUT(_s_, (long)(sizeof(_s_) - 1)); \ } while (0) extern uint64_t get_cycles(void); extern uint64_t get_instret(void); // /* Bare metal memcpy implementation */ void *memcpy(void *dest, const void *src, size_t n) { uint8_t *d = (uint8_t *) dest; const uint8_t *s = (const uint8_t *) src; while (n--) *d++ = *s++; return dest; } /* Software division for RV32I (no M extension) */ static unsigned long udiv(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } static unsigned long umod(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } /* Software multiplication for RV32I (no M extension) */ static uint32_t umul(uint32_t a, uint32_t b) { uint32_t result = 0; while (b) { if (b & 1) result += a; a <<= 1; b >>= 1; } return result; } /* Provide __mulsi3 for GCC */ uint32_t __mulsi3(uint32_t a, uint32_t b) { return umul(a, b); } /* Simple integer to hex string conversion */ static void print_hex(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf) - 1; *p = '\n'; p--; if (val == 0) { *p = '0'; p--; } else { while (val > 0) { int digit = val & 0xf; *p = (digit < 10) ? ('0' + digit) : ('a' + digit - 10); p--; val >>= 4; } } p++; printstr(p, (buf + sizeof(buf) - p)); } /* Simple integer to decimal string conversion */ static void print_dec(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf) - 1; *p = '\n'; p--; if (val == 0) { *p = '0'; p--; } else { while (val > 0) { *p = '0' + umod(val, 10); p--; val = udiv(val, 10); } } p++; printstr(p, (buf + sizeof(buf) - p)); } /* ============= Declaration ============= */ typedef uint8_t uf8; extern uint32_t clz_asm(uint32_t x); extern uint32_t uf8_decode_asm(uf8 fl); extern uf8 uf8_encode_asm(uint32_t value); /* ============= Decode Encode ============= */ static inline unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return n - x; } /* Decode uf8 to uint32_t */ uint32_t uf8_decode(uf8 fl) { uint32_t mantissa = fl & 0x0f; uint8_t exponent = fl >> 4; uint32_t offset = (0x7FFF >> (15 - exponent)) << 4; return (mantissa << exponent) + offset; } /* Encode uint32_t to uf8 */ uf8 uf8_encode(uint32_t value) { /* Use CLZ for fast exponent calculation */ if (value < 16) return value; /* Find appropriate exponent using CLZ hint */ int lz = clz(value); int msb = 31 - lz; /* Start from a good initial guess */ uint8_t exponent = 0; uint32_t overflow = 0; if (msb >= 5) { /* Estimate exponent - the formula is empirical */ exponent = msb - 4; if (exponent > 15) exponent = 15; /* Calculate overflow for estimated exponent */ for (uint8_t e = 0; e < exponent; e++) overflow = (overflow << 1) + 16; /* Adjust if estimate was off */ while (exponent > 0 && value < overflow) { overflow = (overflow - 16) >> 1; exponent--; } } /* Find exact exponent */ while (exponent < 15) { uint32_t next_overflow = (overflow << 1) + 16; if (value < next_overflow) break; overflow = next_overflow; exponent++; } uint8_t mantissa = (value - overflow) >> exponent; return (exponent << 4) | mantissa; } /* ============= Test Suite ============= */ static bool test_c(void) { int32_t previous_value = -1; bool passed = true; for (int i = 0; i < 256; i++) { uint8_t fl = i; int32_t value = uf8_decode(fl); uint8_t fl2 = uf8_encode(value); if (fl != fl2) { passed = false; } if (value <= previous_value) { passed = false; } previous_value = value; } return passed; } static bool test_s(void) { int32_t previous_value = -1; bool passed = true; for (int i = 0; i < 256; i++) { uint8_t fl = i; int32_t value = uf8_decode_asm(fl); uint8_t fl2 = uf8_encode_asm(value); if (fl != fl2) { passed = false; } if (value <= previous_value) { passed = false; } previous_value = value; } return passed; } int main(void) { uint64_t start_cycles, end_cycles, cycles_elapsed; uint64_t start_instret, end_instret, instret_elapsed; TEST_LOGGER("\n=== Uf8 Tests Start ===\n\n"); /* Test : CUf8 */ TEST_LOGGER("\n=== C Tests ===\n\n"); start_cycles = get_cycles(); start_instret = get_instret(); bool success_c = test_c(); end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; TEST_LOGGER(" Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER(" Instructions: "); print_dec((unsigned long) instret_elapsed); TEST_LOGGER("\n"); TEST_LOGGER("\n=== Assembly Tests ===\n\n"); /* Test : CUf8 */ start_cycles = get_cycles(); start_instret = get_instret(); bool success_s = test_s(); end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; TEST_LOGGER(" Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER(" Instructions: "); print_dec((unsigned long) instret_elapsed); TEST_LOGGER("\n"); if((success_s || success_c) == false) { TEST_LOGGER("Some tests failed.\n"); return 1; } else{ TEST_LOGGER("All tests passed.\n"); } return 0; } ``` ::: 1. Basic Syscall I/O This part provide minimal output without relying on libc (printf). - `printstr()` - `TEST_LOGGER()` --- 2. Printing Utilities | Function | Description | | :--- | :--- | | `print_hex(val)` | Prints integer in hex format | | `print_dec(val)` | Prints integer in decimal format | --- 3. Performance Counters ```c extern uint64_t get_cycles(void); extern uint64_t get_instret(void); ``` Used in both C and assembly tests to measure performance. --- 4. Software Implementations (No M Extension) | Function | Purpose | | :--- | :--- | | `memcpy()` | Simple byte copy | | `udiv()` / `umod()` | Bitwise restoring division/modulus | | `umul()` / `__mulsi3()` | Shift-add multiplication | --- 5. UF8 Codec Interface ```c typedef uint8_t uf8; extern uint32_t clz_asm(uint32_t x); extern uint32_t uf8_decode_asm(uf8 fl); extern uf8 uf8_encode_asm(uint32_t value); ``` Assembly versions provided for benchmarking. --- 6. UF8 function and test suites in c Use the same method as in HW1 for testing. --- ### uf8 encode/decode in RISC-V assembly :::spoiler Assembly ```assembly .section .text .align 2 /* --------- uf8_decode --------- */ .globl uf8_decode_asm .type uf8_decode_asm, @function uf8_decode_asm: addi sp, sp, -4 sw ra, 0(sp) andi t0, a0, 0x0f #t0 = mantissa srli t1, a0, 4 #t1 = exponent # offset = ((1 << exponent) - 1) << 4 addi t2, zero, 1 sll t2, t2, t1 addi t2, t2, -1 slli t2, t2, 4 #t2 = offset sll a0,t0, t1 add a0,a0, t2 lw ra, 0(sp) # restore return addr addi sp, sp, 4 ret /* --------- uf8_encode --------- */ .globl uf8_encode_asm .type uf8_encode_asm, @function uf8_encode_asm: li t0, 16 slt t0, a0 ,t0 bne t0, zero, return_value addi sp, sp, -28 sw s0, 24(sp) sw s1, 20(sp) sw s2, 16(sp) sw s3, 12(sp) sw s4, 8(sp) sw s5, 4(sp) sw ra, 0(sp) add s0, a0, zero #s0 = value #Find appropriate exponent using CLZ hint jal ra, clz_asm add s1, a0, x0 #s1 = lz li t0, 31 sub s2, t0, s1 #s2 = msb add s3, zero, zero #s3 = exponent add s4, zero, zero #s4 = overflow li t0, 5 blt t0, s2, 1f addi s3, s2, -4 li t0, 15 ble s3, t0, 2f li s3, 15 # Calculate overflow for estimated exponent 2: li s5, 0 3: bge s5, s3, 4f slli s4, s4, 1 addi s4, s4, 16 addi s5, s5, 1 j 3b # Adjust if estimate was off 4: slt t0, x0, s3 slt t1, s0, s4 and t0, t0, t1 beq t0, x0, 1f addi s4, s4, -16 srli s4, s4, 1 addi s3, s3, -1 j 4b #Find exact exponent 1: addi t0, x0, 15 bge s3, t0, encode_done slli s6, s4, 1 addi s6, s6, 16 blt s0, s6, encode_done mv s4, s6 addi s3, s3, 1 j 1b encode_done: sub s7, s0, s4 srl s7, s7, s3 slli a0, s3, 4 or a0, a0, s7 lw ra, 0(sp) lw s5, 4(sp) lw s4, 8(sp) lw s3, 12(sp) lw s2, 16(sp) lw s1, 20(sp) lw s0, 24(sp) addi sp, sp, 28 return_value: ret /* --------- clz (count leading zeros, 0..32) --------- */ .globl clz_asm .type clz_asm, @function clz_asm: addi t0, zero, 32 #t0 = n addi t1, zero, 16 #t1 = c add t3, a0, zero #t3 = x clz_Loop: srl t2, t3, t1 #t2 = y beq t2, zero, clz_y_0 sub t0, t0, t1 add t3, t2, zero #t3 = x clz_y_0: srli t1,t1,1 # c >> 1 bne t1, x0, clz_Loop sub a0,t0,t3 ret ``` ::: Since `rv32emu` (ithis assignment) and `Ripes` ( assignment 1) are not compatible , we need to modify/correct the system calls first so that the assembly code can execute successfully on `rv32emu`. When writing assembly that will be linked with C code, we often need to mark functions as `global` (visible to the linker and other object files). > >- `.globl uf8_decode_asm` > Declares the symbol uf8_decode_asm as global. > > This allows a C file that declares. > ``` > extern uint32_t uf8_decode_asm(uint8_t); > ``` > to link correctly to this assembly function. --- ### Building the Test Suite ``` make ``` Build artifacts: - `main.o` - Test implementation - `perfcounter.o` - CSR - `start.o` - startup code with BSS initialization - `uf8.o` - uf8 encode/decode implementation - `test.elf` - linked executable ### Running test ``` make run ``` Output: ``` === Uf8 Tests Start === === C Tests === Cycles: 71316 Instructions: 71316 === Assembly Tests === Cycles: 41762 Instructions: 41762 All tests passed. ``` The UF8 test results show that both the `C` and `assembly` implementations produce identical and correct outputs, confirming functional equivalence. However, the performance measurements reveal a clear improvement in the assembly version. The C version required 71,316 cycles/instructions, while the assembly version used only 41,762, achieving roughly a 1.7× speedup. This demonstrates that manual optimization in assembly can significantly reduce instruction count and improve execution efficiency on RV32I processors lacking hardware multiplication and division. --- The compilation options above use the default `-O0` setting. I also experimented with different optimization levels, `-Ofast` (optimized for speed) and `-Os` (optimized for size). - Ofast ``` make FAST=1 ``` Output: ``` === Uf8 Tests Start === === C Tests === Cycles: 19802 Instructions: 19801 === Assembly Tests === Cycles: 63308 Instructions: 38414 All tests passed. ``` - Os ``` make SIZE=1 ``` Output: ``` === Uf8 Tests Start === === C Tests === Cycles: 32459 Instructions: 32459 All tests passed. ``` After applying different compiler optimization levels, the performance of the UF8 test program changed noticeably. When optimized with `-Ofast`, the C implementation improved dramatically to `19,802` cycles, showing that compiler optimization can effectively reduce redundant instructions. In contrast, the assembly version did not benefit from compiler optimizations and even became slower (63,308 cycles) since hand-tuned assembly is already optimized and may be affected by different inlining or alignment. Using `-Os` (size optimization) increased the C execution time (32,459 cycles) but reduced code size. :::warning When creating an assembly file, make sure the `.S` extension is capitalized. I spent half a day debugging this issue : ) ::: --- # [hanoi](https://hackmd.io/@sysprog/arch2025-quiz2-sol) This problem approaches the Tower of Hanoi using an `iterative Gray code–based` method instead of traditional recursion. The key idea is that each Gray code sequence changes by exactly one bit at a time, and this single bit flip directly indicates which disk should be moved. This strategy ensures that only one disk moves per step and that larger disks are never placed on smaller ones, fully preserving the rules of the Tower of Hanoi. --- The directory contains a comprehensive bare-metal test suite: - Makefile - hanoi in RISC-V assembly - hanoi function code in c using iterative method - Performance counter functions for RISC-V - Startup code in assembly - Linker scripts ### Makefile :::spoiler Makefile ```Makefile include ../../mk/toolchain.mk ARCH = -march=rv32izicsr LINKER_SCRIPT = linker.ld EMU ?= ../../build/rv32emu AFLAGS = -g $(ARCH) LDFLAGS = -T $(LINKER_SCRIPT) ifdef FAST CFLAGS = -g -march=rv32i_zicsr -mabi=ilp32 -Ofast else ifdef SIZE CFLAGS = -g -march=rv32i_zicsr -mabi=ilp32 -Os else CFLAGS = -g -march=rv32i_zicsr -mabi=ilp32 -O0 endif ifneq ($(DEFINES),) CFLAGS += $(DEFINES) endif EXEC = test.elf CC = $(CROSS_COMPILE)gcc AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld OBJDUMP = $(CROSS_COMPILE)objdump OBJS = start.o perfcounter.o main.o hanoi_asm.o .PHONY: all run dump clean all: $(EXEC) $(EXEC): $(OBJS) $(LINKER_SCRIPT) $(LD) $(LDFLAGS) -o $@ $(OBJS) %.o: %.S $(AS) $(AFLAGS) $< -o $@ %.o: %.c $(CC) $(CFLAGS) $< -o $@ -c run: $(EXEC) @test -f $(EMU) || (echo "Error: $(EMU) not found" && exit 1) @grep -q "ENABLE_ELF_LOADER=1" ../../build/.config || (echo "Error: ENABLE_ELF_LOADER=1 not set" && exit 1) @grep -q "ENABLE_SYSTEM=1" ../../build/.config || (echo "Error: ENABLE_SYSTEM=1 not set" && exit 1) $(EMU) $< dump: $(EXEC) $(OBJDUMP) -Ds $< | less clean: rm -f $(EXEC) $(OBJS) ``` ::: This Makefile builds and runs a bare-metal `RV32I (Zicsr)` program on rv32emu, similar to the one used in the `uf8` project, and supports different optimization levels (`-Ofast`, `-Os`, or `-O0`) during compilation. --- ### main.c :::spoiler c code ```c #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> static inline void write_str_syscall(int fd, const char *ptr, long len) { register long a0 asm("a0") = fd; register const char *a1 asm("a1") = ptr; register long a2 asm("a2") = len; register long a7 asm("a7") = 64; // system_write asm volatile("ecall" : "+r"(a0) : "r"(a1), "r"(a2), "r"(a7) : "memory", "cc"); } #define printstr(ptr, length) write_str_syscall(1, (ptr), (length)) #define TEST_OUTPUT(msg, length) printstr((msg), (length)) #define TEST_LOGGER(msg) \ do { \ static const char _s_[] = msg; \ TEST_OUTPUT(_s_, (long)(sizeof(_s_) - 1)); \ } while (0) extern uint64_t get_cycles(void); extern uint64_t get_instret(void); /* Software division for RV32I (no M extension) */ static unsigned long udiv(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } static unsigned long umod(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } /* Software multiplication for RV32I (no M extension) */ static uint32_t umul(uint32_t a, uint32_t b) { uint32_t result = 0; while (b) { if (b & 1) result += a; a <<= 1; b >>= 1; } return result; } /* Provide __mulsi3 for GCC */ uint32_t __mulsi3(uint32_t a, uint32_t b) { return umul(a, b); } /* Simple integer to hex string conversion */ static void print_hex(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf) - 1; *p = '\n'; p--; if (val == 0) { *p = '0'; p--; } else { while (val > 0) { int digit = val & 0xf; *p = (digit < 10) ? ('0' + digit) : ('a' + digit - 10); p--; val >>= 4; } } p++; printstr(p, (buf + sizeof(buf) - p)); } /* Simple integer to decimal string conversion */ void print_dec(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf) - 1; // *p = '\n'; // p--; if (val == 0) { *p = '0'; p--; } else { while (val > 0) { *p = '0' + umod(val, 10); p--; val = udiv(val, 10); } } p++; printstr(p, (buf + sizeof(buf) - p)); } void print_char(char c) { char buf[1]; buf[0] = c; printstr(buf, 1); } /* ============= Declaration ============= */ extern uint32_t hanoi_asm(); /* ============= Hanoi ============= */ typedef struct { int v[3]; int top; } Stack; static void push(Stack *s, int x) { s->v[++s->top] = x; } static int pop (Stack *s) { return s->v[s->top--]; } static int peek(const Stack *s) { return (s->top >= 0) ? s->v[s->top] : 0; } static void move_between(Stack *a, Stack *b, char la, char lb) { int ta = peek(a), tb = peek(b); int x; const char A = 'A'; const char B = 'B'; if (ta == 0) { x = pop(b); push(a, x); } else if (tb == 0) { x = pop(a); push(b, x); } else if (ta < tb) { x = pop(a); push(b, x); } else { x = pop(b); push(a, x); } TEST_LOGGER("Move Disk "); print_dec(x); TEST_LOGGER(" from "); print_char(la); TEST_LOGGER(" to "); print_char(lb); TEST_LOGGER("\n"); } void hanoi_iter(int n) { Stack A = {.top = -1}, B = {.top = -1}, C = {.top = -1}; for (int d = n; d >= 1; --d) push(&A, d); const int total = (1 << n) - 1; int r = 0; for (int step = 0; step < total; ++step) { if (r == 0) move_between(&A, &C, 'A', 'C'); else if (r == 1) move_between(&A, &B, 'A', 'B'); else move_between(&B, &C, 'B', 'C'); ++r; if (r == 3) r = 0; } } int main(void) { uint64_t start_cycles, end_cycles, cycles_elapsed; uint64_t start_instret, end_instret, instret_elapsed; TEST_LOGGER("\n=== hanoi Tests Start ===\n\n"); TEST_LOGGER("\n=== Assembly Tests ===\n\n"); start_cycles = get_cycles(); start_instret = get_instret(); hanoi_asm(); end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; TEST_LOGGER("\n"); TEST_LOGGER("Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER("\n"); TEST_LOGGER("Instructions: "); print_dec((unsigned long) instret_elapsed); TEST_LOGGER("\n"); TEST_LOGGER("\n=== C Tests ===\n\n"); start_cycles = get_cycles(); start_instret = get_instret(); hanoi_iter(3); end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; TEST_LOGGER("\n"); TEST_LOGGER("Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER("\n"); TEST_LOGGER("Instructions: "); print_dec((unsigned long) instret_elapsed); TEST_LOGGER("\n"); return 0; } ``` ::: This program compares two Tower of Hanoi implementations — one in assembly (`hanoi_asm`) and one in C (`hanoi_iter`). The arithmetic and I/O parts in this program are identical to those in the previous main.c used for the uf8 project. They include the same software implementations of `multiplication`, `division`, and `modulo` operations , as well as the same system call–based `print functions`. --- ### hanoi in RISC-V assembly :::spoiler Assembly ```assembly .data str1: .string "Move Disk " str2: .string " from " str3: .string " to " strA: .string "A" strB: .string "B" strC: .string "C" str: .string "\n" .text .globl hanoi_asm .type hanoi_asm, @function hanoi_asm: addi x2, x2, -36 sw x8, 0(x2) sw x9, 4(x2) sw x18, 8(x2) sw x19, 12(x2) sw x20, 16(x2) sw ra, 32(x2) li x5, 0x15 sw x5, 20(x2) sw x5, 24(x2) sw x5, 28(x2) # Fix disk positions (BLANK 1-3: neutralize x5 effect) # BLANK 1: Fix position at x2+20 sw x0, 20(x2) # BLANK 2: Fix position at x2+24 sw x0, 24(x2) # BLANK 3: Fix position at x2+28 sw x0, 28(x2) addi x8, x0, 1 game_loop: # BLANK 4: Check loop termination (2^3 moves) addi x5, x0, 8 beq x8, x5, finish_game # Gray code formula: gray(n) = n XOR (n >> k) # BLANK 5: What is k for Gray code? srli x5, x8, 1 # BLANK 6: Complete Gray(n) calculation xor x6, x8, x5 # BLANK 7-8: Calculate previous value and its shift addi x7, x8, -1 srli x28, x7, 1 # BLANK 9: Generate Gray(n-1) xor x7, x7, x28 # BLANK 10: Which bits changed? xor x5, x6, x7 # Initialize disk number addi x9, x0, 0 # BLANK 11: Mask for testing LSB andi x6, x5, 1 # BLANK 12: Branch if disk 0 moves bne x6, x0, disk_found # BLANK 13: Set disk 1 addi x9, x0, 1 # BLANK 14: Test second bit with proper mask andi x6, x5, 2 bne x6, x0, disk_found # BLANK 15: Last disk number addi x9, x0, 2 disk_found: # BLANK 16: Check impossible pattern (multiple bits) andi x30, x5, 5 addi x31, x0, 5 beq x30, x31, pattern_match jal x0, continue_move pattern_match: continue_move: # BLANK 17: Word-align disk index (multiply by what?) slli x5, x9, 2 # BLANK 18: Base offset for disk array addi x5, x5, 20 add x5, x2, x5 lw x18, 0(x5) bne x9, x0, handle_large # BLANK 19: Small disk moves by how many positions? addi x19, x18, 2 # BLANK 20: Number of pegs addi x6, x0, 3 blt x19, x6, display_move sub x19, x19, x6 jal x0, display_move handle_large: # BLANK 21: Load reference disk position lw x6, 20(x2) # BLANK 22: Sum of all peg indices (0+1+2) addi x19, x0, 3 sub x19, x19, x18 sub x19, x19, x6 display_move: # print "Move Disk " la a0, str1 call puts # # print disk number (1..3) as one char addi a0, x9, 1 call print_dec # print " from " la a0, str2 call puts addi t0, x18, 65 mv a0, t0 call print_char print_to: # print " to " la a0, str3 call puts addi t0, x19, 65 mv a0, t0 call print_char next_move: la a0, str call puts slli x5, x9, 2 addi x5, x5, 20 add x5, x2, x5 sw x19, 0(x5) addi x8, x8, 1 jal x0, game_loop finish_game: lw x8, 0(x2) lw x9, 4(x2) lw x18, 8(x2) lw x19, 12(x2) lw x20, 16(x2) lw x1, 32(x2) addi x2, x2, 36 ret # print puts: add t0, a0, x0 add t1, a0, x0 1: lbu t2, 0(t1) beq t2, x0, 2f addi t1, t1, 1 j 1b 2: sub a2, t1, t0 mv a1, t0 li a0, 1 li a7, 64 ecall ret ``` ::: - Added `puts` subroutine : - The `puts` function scans a null-terminated string to calculate its length, then performs the `ecall` to print it. - Call `print_char` in main : - Before calling, the ASCII code of the character to be printed is placed in register `a0`and then Prints a single character. - This prints the peg name `A`, `B`, or `C`. - Call `print_dec` in main : - Prints a non-negative integer in decimal form that is placed in `a0`. --- ### Building the Test Suite ``` make ``` Build artifacts: - `main.o` - Test implementation - `perfcounter.o` - CSR - `start.o` - startup code with BSS initialization - `hanoi_asm.o` - hanoi implementation - `test.elf` - linked executable ### Running test 1. The compilation options use the default `-O0` setting. ``` make run ``` Output: ``` === hanoi Tests Start === === Assembly Tests === Move Disk 1 from A to C Move Disk 2 from A to B Move Disk 1 from C to B Move Disk 3 from A to C Move Disk 1 from B to A Move Disk 2 from B to C Move Disk 1 from A to C Cycles: 10552 Instructions: 10552 === C Tests === Move Disk 1 from A to C Move Disk 2 from A to B Move Disk 1 from B to C Move Disk 3 from A to C Move Disk 1 from A to B Move Disk 2 from B to C Move Disk 1 from A to C Cycles: 11101 Instructions: 11101 ``` 2. The compilation options use `-Ofast` ``` make clean make FAST=1 ``` Output: ``` === hanoi Tests Start === === Assembly Tests === Move Disk 1 from A to C Move Disk 2 from A to B Move Disk 1 from C to B Move Disk 3 from A to C Move Disk 1 from B to A Move Disk 2 from B to C Move Disk 1 from A to C Cycles: 4922 Instructions: 4922 === C Tests === Move Disk 1 from A to C Move Disk 2 from A to B Move Disk 1 from B to C Move Disk 3 from A to C Move Disk 1 from A to B Move Disk 2 from B to C Move Disk 1 from A to C Cycles: 4032 Instructions: 4032 ``` 3. The compilation options use `-Os` ``` make clean make SIZE=1 ``` Output: ``` === hanoi Tests Start === === Assembly Tests === Move Disk 1 from A to C Move Disk 2 from A to B Move Disk 1 from C to B Move Disk 3 from A to C Move Disk 1 from B to A Move Disk 2 from B to C Move Disk 1 from A to C Cycles: 4852 Instructions: 4852 === C Tests === Move Disk 1 from A to C Move Disk 2 from A to B Move Disk 1 from B to C Move Disk 3 from A to C Move Disk 1 from A to B Move Disk 2 from B to C Move Disk 1 from A to C Cycles: 4339 Instructions: 4339 ``` --- Comparison of three compilation options for the Assembly and C versions of the Hanoi : | Compilation Option | Assembly (cycles) | vs`-O0` | C (cycles) | vs`-O0`| Notes | |:---:|:---:|:---:|:---:|:---:|---| | `-O0` | **10,552** | — | **11,101** | — | Baseline setting | | `-Ofast` | **4,922** | **−53%** | **4,032** | **−64%** | Aggressive speed optimization; biggest gain for the C version | | `-Os` | **4,852** | **−54%** | **4,339** | **−61%** | Size-oriented optimization; asm slightly faster than `-Ofast` | From the comparison above, both the Assembly and C implementations of the Hanoi program show significant performance improvements after optimization. --- # [rsqrt](https://hackmd.io/@sysprog/arch2025-quiz3-sol) The `rsqrt` module implements a fast reciprocal square root optimized for RV32I processors. It combines a `LUT` for initial approximation and Newton-Raphson iterations for refinement, all using fixed-point arithmetic and shift-add operations. This approach can achieve around 3-8% precision error while maintaing high performance and low hardware cost. You can see more detail description of this problem [here](https://hackmd.io/@sysprog/arch2025-quiz3-sol). --- ### Implement rsqrt in C 1. clz ```clike static int clz(uint32_t x){ if(!x) return 32; int n = 0; if((x & 0xFFFF0000) == 0) { n += 16; x <<= 16; } if((x & 0xFF000000) == 0) { n += 8; x <<= 8; } if((x & 0xF0000000) == 0) { n += 4; x <<= 4; } if((x & 0xC0000000) == 0) { n += 2; x <<= 2; } if((x & 0x80000000) == 0) { n += 1; } return n; } ``` Determines the most significant bit position of the input `x`. This identifies the approximate magnitude of `x` and selects an initial estimate from the `LUT`. 2. Initial Estimate using LUT ```clike static const uint32_t rsqrt_table[32] = { 65536, 46341, 32768, 23170, 16384, 11585, 8192, 5793, 4096, 2896, 2048, 1448, 1024, 724, 512, 362, 256, 181, 128, 90, 64, 45, 32, 23, 16, 11, 8, 6, 4, 3, 2, 1 }; ``` The `rsqrt_table[32]` stores precomputed fixed-point values representing $$ \frac{65536}{\sqrt{2^{exp}}} $$ corresponding to each possible MSB position (`exp`). 3. Linear Interpolation ```clike if (x > (1u << exp)) { uint32_t y_next = (exp < 31) ? rsqrt_table[exp + 1] : 0; uint32_t delta = y - y_next; // numer = ((uint32_t)(x - (1u<<exp)) << 16) uint32_t t = x - (1u << exp); uint32_t numer_hi = t >> 16; // high 32 uint32_t numer_lo = t << 16; // low 32 uint32_t frac = shr64_lo32(numer_hi, numer_lo, exp); uint32_t p_hi, p_lo; mul32_lohi(delta, frac, &p_hi, &p_lo); uint32_t prod_shr16 = shr64_lo32(p_hi, p_lo, 16); y = y - prod_shr16; } ``` For input between power of two $(2^{exp} < x < 2^{exp+1})$ ,the code linearly interpolates between adjacent table values (`y` and `y_next`) to refine the initial estimate: $$ y = y_{base} - (y_{base} - y_{next})*fraction $$ 4. Newton-Raphson Iteration ```clike for(int iter = 0; iter < 2; iter++) { uint64_t y_sq = mul32(y, y); uint32_t y2 = (uint32_t)(y_sq >> 16); uint64_t xy2 = (uint64_t)mul32(x, y2); uint64_t factor_num = (3u << 16); factor_num = (xy2 >= factor_num) ? 0 : (factor_num - xy2); uint32_t factor = (uint32_t)(factor_num >> 1); y = (uint32_t)((mul32(y, factor)) >> 16); } ``` Applies two fixed-point refinement steps using the formula: $$y_{n+1} = y_n \times \frac{3 - x⋅y_n^2/2^{16}}{2} $$ implemented entirely with 32-bits arithemetic. In this implementation, I decided to perform `two` iteration, which provides a good balance between accuracy and computational cost. And finally do the fixed-point scaling because values are scaled by $2^{16}$ , shifts (>>16) compensate for scalling after each multiplication. --- ### Analyzing precision and performance :::spoiler Test code ```c void test_rsqrt(void) { const uint32_t test_vals[] = { 1, 2, 3, 4, 5, 8, 10, 16, 31, 32, 64, 100, 255, 256, 512, 1000, 4096, 65536, 100000 }; const uint32_t ref_vals[] = { 65536, 46341, 37837, 32768, 29309, 23170, 20724, 16384, 11771, 11585, 8192, 6554, 4104, 4096, 2896, 2072, 1024, 256, 207 }; const int N = sizeof(test_vals)/sizeof(test_vals[0]); uint64_t total_abs_err = 0; uint64_t total_rel_bp = 0; uint32_t worst_x_abs = 0, worst_abs = 0; uint32_t worst_rel_bp = 0; TEST_LOGGER("\n--- rsqrt Test (integer) ---\n"); for (int i = 0; i < N; i++) { uint32_t x = test_vals[i]; uint32_t y = rsqrt(x); uint32_t ref = ref_vals[i]; uint32_t abs_err = (y >= ref) ? (y - ref) : (ref - y); uint32_t rel_bp = udiv(abs_err * 10000u + (ref >> 1), ref); total_abs_err += abs_err; total_rel_bp += rel_bp; if (abs_err > worst_abs) { worst_abs = abs_err; worst_x_abs = x; } if (rel_bp > worst_rel_bp){ worst_rel_bp = rel_bp; } TEST_LOGGER("x="); print_dec(x); TEST_LOGGER(" rsqrt="); print_dec(y); TEST_LOGGER(" ref="); print_dec(ref); TEST_LOGGER(" abs="); print_dec(abs_err); TEST_LOGGER(" rel="); print_dec(udiv(rel_bp , 100)); TEST_LOGGER("."); print_dec(umod(rel_bp , 100)); TEST_LOGGER("\n"); } uint32_t avg_abs = (uint32_t)(udiv(total_abs_err , N)); uint32_t avg_bp = (uint32_t)(udiv(total_rel_bp , N)); TEST_LOGGER("\n--- Summary ---\n"); TEST_LOGGER("Avg abs err: "); print_dec(avg_abs); TEST_LOGGER("\nAvg rel err: "); print_dec(udiv(avg_bp , 100)); TEST_LOGGER("."); print_dec(umod(avg_bp , 100)); TEST_LOGGER("\n"); TEST_LOGGER("Worst abs err: "); print_dec(worst_abs); TEST_LOGGER(" (x="); print_dec(worst_x_abs); TEST_LOGGER(")\n"); TEST_LOGGER("Worst rel err: "); print_dec(udiv(worst_rel_bp , 100)); TEST_LOGGER("."); print_dec(umod(worst_rel_bp , 100)); TEST_LOGGER("\n"); } ``` ::: To verify the correctness and precision of the `rsqrt()` function, I implment a dedicated test routine `test_rsqrt()`. The function compares the computed reciprocal square root against reference fixed-point values for a set of representative input integers. --- #### Test set A list of 19 integer inputs (`test_vals[]`) was selected, ranging from small values(1-10) to larger ones (up to 100,000). Each input has a reference result(`ref_vals[]`) corresponding to the ideal fixed-point value of $\frac{1}{\sqrt{x}}\times2^{16}$. --- #### Test loop For each input `x` - The function computes `y = rsqrt(x)` using the implemented algorithm. - The absolute error is calculated as $$ {abs\_err} = | y - ref | $$ - The relative error is calculated as $$rel\_bp = \frac{|y-ref|}{ref}\times10000 $$ --- #### Error Accumulation - `total_abs_error` and `total_rel_bp` accumlate errors over all test cases. - The test also tracks both the average and worst-case error. --- #### Test result ``` === rsqrt Test Start === --- rsqrt Test (integer) --- x=1 rsqrt=65536 ref=65536 abs=0 rel=0.0 x=2 rsqrt=46341 ref=46341 abs=0 rel=0.0 x=3 rsqrt=37837 ref=37837 abs=0 rel=0.0 x=4 rsqrt=32768 ref=32768 abs=0 rel=0.0 x=5 rsqrt=29309 ref=29309 abs=0 rel=0.0 x=8 rsqrt=23171 ref=23170 abs=1 rel=0.0 x=10 rsqrt=20724 ref=20724 abs=0 rel=0.0 x=16 rsqrt=16384 ref=16384 abs=0 rel=0.0 x=31 rsqrt=11771 ref=11771 abs=0 rel=0.0 x=32 rsqrt=11587 ref=11585 abs=2 rel=0.2 x=64 rsqrt=8192 ref=8192 abs=0 rel=0.0 x=100 rsqrt=6557 ref=6554 abs=3 rel=0.5 x=255 rsqrt=4110 ref=4104 abs=6 rel=0.15 x=256 rsqrt=4096 ref=4096 abs=0 rel=0.0 x=512 rsqrt=2907 ref=2896 abs=11 rel=0.38 x=1000 rsqrt=2072 ref=2072 abs=0 rel=0.0 x=4096 rsqrt=1024 ref=1024 abs=0 rel=0.0 x=65536 rsqrt=256 ref=256 abs=0 rel=0.0 x=100000 rsqrt=239 ref=207 abs=32 rel=15.46 --- Summary --- Avg abs err: 2 Avg rel err: 0.84 Worst abs err: 32 (x=100000) Worst rel err: 15.46 Cycles: 422239 Instret: 422239 ``` The test result show that the implemented `rsqrt()` function performs accurately for most inputs, with only minor deviations for large values. The **average absolute error** is 2, and the **average relative error** is only **0.84%**, confirming excellent overall precision. For conclusion, the fixed-point `rsqrt()` achieves a strong balance between speed and precision, making it -well-suited for application without floating-point hardware. --- # [Fibonacci sequence](https://en.wikipedia.org/wiki/Fibonacci_sequence) To evaluate performance, I tested three implementations of the Fibonacci function: the recursive C version, the iterative C version, and my handwritten RISC-V assembly version. ### Recursive Version `fib(n)` Based directly on the mathematical definition : $$ F(n) = F(n-1) + F(n-2),\ F(0) = 0, F(1) =1 $$ The function call itself twice for each `n > 1` and sums the result. ```c static uint64_t fib(uint64_t n) { if (n <= 1) return n; return fib(n - 1) + fib(n - 2); } ``` --- ### Iterative Version `fib_iter_c(n)` Computes Fibonacci numbers `bottom-up`, keeping only the last two values in memory. ```clike volatile uint64_t fib_iter_c(uint64_t n) { if (n <= 1) return n; volatile uint64_t f0 = 0, f1 = 1; for (uint64_t i = 1; i < n; ++i) { volatile uint64_t f2 = f0 + f1; f0 = f1; f1 = f2; } return f1; } ``` 1. Initialize `f0 = 0`, `f1 = 1` 2. For each iteration, compute `f2 = f0 + f1`, then shift `f0 = f1`, `f1 = f2`. 3. After the loop, `f1` holds $F(n)$. --- ### Handwritten assembly I implement the iterative version directly in RISC-V assembly. :::spoiler Assembly ``` .globl fib_iter .type fib_iter, @function .text fib_iter: # if (n == 0) return 0; beqz a0, .Lret0 # if (n == 1) return 1; addi t0, a0, -1 beqz t0, .Lret1 li t1, 0 # f0 li t2, 1 # f1 .Lloop: add t3, t1, t2 # f2 = f0 + f1 mv t1, t2 # f0 = f1 mv t2, t3 # f1 = f2 addi t0, t0, -1 bnez t0, .Lloop mv a0, t2 li a1, 0 ret .Lret0: li a0, 0 li a1, 0 ret .Lret1: li a0, 1 li a1, 0 ret ``` ::: --- ### Test result To compare efficiency, all version computed `fib(19)` under identical condition. I used `getticks()` function to measure the cycle count. ```c Recursion (C): elapsed cycle: 133630 Iteration (C): elapsed cycle: 462 Assembly: elapsed cycle: 108 ``` - The recursive version is the least efficient, as each call creates two new recursive branches, leading to exponential growth in execution time. - The iterative C version significantly reduces complexity to linear time by reusing results in each loop iteration. - The assembly version achieves the best performance because it removes compiler overhead (stack frame setup, variable access) and directly manipulates registers, minimizing instruction count and memory access. The result clearly demonstrate that: Assembly > Iterative C > Recursive C in both speed and instruction efficiency.