hw2 - HackMD

# Assignment 2: RISC-V Bare-Metal Optimization ## 1. Project Overview and Bare-Metal Setup This project implemented and optimized applications for the RV32I ISA using the ``rv32emu`` bare-metal environment. All programs rely on custom assembly (``start.S``, ``perfcounter.S``) for startup, memory mapping, and I/O (Syscall 64). All C baselines were compiled with -O0 to isolate the performance impact of soft-implemented arithmetic operations. ### Makefile ``` # ----------------------------------------------------------------------------- # 1. ENV & TOOLCHAIN CONFIGURATION # ----------------------------------------------------------------------------- include ../../../mk/toolchain.mk LINKER_SCRIPT = linker.ld EMU ?= ../../../build/rv32emu # RISC-V RV32I Restriction (No M/F extensions) ARCH = -march=rv32izicsr # Linking flags: Disable standard libraries and use custom linker script LDFLAGS = -nostdlib -nostartfiles -T $(LINKER_SCRIPT) CC = $(CROSS_COMPILE)gcc AS = $(CROSS_COMPILE)as OBJDUMP = $(CROSS_COMPILE)objdump # CFLAGS definitions COMMON_CFLAGS = -g -march=rv32i_zicsr -mabi=ilp32 -Wall -Werror -ffreestanding -std=c99 CFLAGS_O2 = $(COMMON_CFLAGS) -O2 CFLAGS_Os = $(COMMON_CFLAGS) -Os CFLAGS_O0 = $(COMMON_CFLAGS) -O0 AFLAGS = -g $(ARCH) # ----------------------------------------------------------------------------- # 2. PROJECT & SHARED FILES # ----------------------------------------------------------------------------- # Shared bare-metal objects: Startup and performance counter COMMON_ASRCS = start.S perfcounter.S COMMON_OBJS = $(COMMON_ASRCS:.S=.o) # C Baseline ELF targets PROJECTS = q2a_hanoi q3c_sine hw1_remove ELFS = $(addsuffix .elf, $(PROJECTS)) # Optimized/Experiment ELF targets Q2A_ASM_ELF = q2a_hanoi_asm.elf Q3C_ASM_ELF = q3c_sine_asm.elf Q3C_OS_ELF = q3c_sine_os.elf Q2A_PERF_ELF = q2a_hanoi_perf.elf # C Object Files Q2A_C_OBJ = q2a_hanoi.o Q3C_C_OBJ = q3c_sine.o HW1_C_OBJ = hw1_remove.o Q2A_PERF_C_OBJ = q2a_hanoi_perf.o Q2A_PERF_C_SRC = q2a_hanoi_perf.c # <-- Source file definition # Assembly Object Files Q2A_ASM_OBJ = q2a_hanoi_asm.o Q3C_ASM_OBJ = q3c_sine_asm.o # C Objects compiled with macros (to hide C implementations for ASM linking) Q2A_C_OBJ_FOR_ASM = q2a_hanoi_for_asm.o Q3C_C_OBJ_FOR_ASM = q3c_sine_for_asm.o # ----------------------------------------------------------------------------- # 3. RULES AND TARGETS # ----------------------------------------------------------------------------- .PHONY: all run_q2a_c run_q2a_asm run_q3c_c run_q3c_os run_hw1_c run_q2a_perf_c clean dump size_check all: $(ELFS) $(Q2A_ASM_ELF) $(Q3C_ASM_ELF) $(Q3C_OS_ELF) $(Q2A_PERF_ELF) # --- A. Compilation Rules --- # Rule: Compile C Baselines (O0) $(Q2A_C_OBJ): q2a_hanoi.c $(CC) $(CFLAGS_O0) -c $< -o $@ $(Q3C_C_OBJ): q3c_sine.c $(CC) $(CFLAGS_O0) -c $< -o $@ $(HW1_C_OBJ): hw1_remove.c $(CC) $(CFLAGS_O0) -c $< -o $@ # Rule: Compile Q2A Pure Arithmetic C File (O0) - FIXED $(Q2A_PERF_C_OBJ): $(Q2A_PERF_C_SRC) $(CC) $(CFLAGS_O0) -c $< -o $@ # Rule: Compile Q3C C Experiment (Os) q3c_sine_os.o: q3c_sine.c $(CC) $(CFLAGS_Os) -c $< -o $@ # Rule: Compile C Object for ASM Linking (Hides C logic via macro) $(Q2A_C_OBJ_FOR_ASM): q2a_hanoi.c $(CC) $(CFLAGS_O2) -DTEST_ASM_OPTIMIZED_Q2A -c $< -o $@ $(Q3C_C_OBJ_FOR_ASM): q3c_sine.c $(CC) $(CFLAGS_O2) -DTEST_ASM_OPTIMIZED_Q3C -c $< -o $@ # Rule: Compile Assembly Files (.S) $(Q2A_ASM_OBJ): q2a_hanoi.S $(CC) $(AFLAGS) -c $< -o $@ $(Q3C_ASM_OBJ): q3c_sine.S $(CC) $(AFLAGS) -c $< -o $@ # Rule: Compile Shared Assembly Objects %.o: %.S $(CC) $(AFLAGS) -c $< -o $@ # --- B. Linking Rules (No changes needed) --- # Rule: Link C Baseline ELFs q2a_hanoi.elf: $(Q2A_C_OBJ) $(COMMON_OBJS) $(CC) $(LDFLAGS) -o $@ $^ q3c_sine.elf: $(Q3C_C_OBJ) $(COMMON_OBJS) $(CC) $(LDFLAGS) -o $@ $^ hw1_remove.elf: $(HW1_C_OBJ) $(COMMON_OBJS) $(CC) $(LDFLAGS) -o $@ $^ # Rule: Link Q2A Pure Arithmetic ELF $(Q2A_PERF_ELF): $(Q2A_PERF_C_OBJ) $(COMMON_OBJS) $(CC) $(LDFLAGS) -o $@ $^ # Rule: Link Q3C Os Experiment ELF $(Q3C_OS_ELF): q3c_sine_os.o $(COMMON_OBJS) $(CC) $(LDFLAGS) -o $@ $^ # Rule: Link Q2A Assembly Optimized ELF $(Q2A_ASM_ELF): $(Q2A_ASM_OBJ) $(Q2A_C_OBJ_FOR_ASM) $(COMMON_OBJS) @echo "--- Linking Q2A Assembly Version ---" $(CC) $(LDFLAGS) -o $@ $^ # Rule: Link Q3C Assembly Optimized ELF $(Q3C_ASM_ELF): $(Q3C_ASM_OBJ) $(Q3C_C_OBJ_FOR_ASM) $(COMMON_OBJS) @echo "--- Linking Q3C Assembly Version ---" $(CC) $(LDFLAGS) -o $@ $^ # --- C. Run Targets (Use -t -p manually for cycle measurement) --- # Run HW1 C Baseline run_hw1_c: hw1_remove.elf @echo "--- Running HW1 (Remove Element) C Baseline (O0) ---" $(EMU) $< # Run Q2A C Baseline (with I/O) run_q2a_c: q2a_hanoi.elf @echo "--- Running Q2A (Hanoi) C Baseline (O0, with I/O) ---" $(EMU) $< # Run Q2A Assembly Optimized run_q2a_asm: $(Q2A_ASM_ELF) @echo "--- Running Q2A (Hanoi) Assembly Optimized ---" $(EMU) $< # Run Q2A C Pure Arithmetic Baseline (O0) run_q2a_perf_c: $(Q2A_PERF_ELF) @echo "--- Running Q2A (Hanoi) C Baseline (O0, PURE ARITHMETIC) ---" $(EMU) $< # Run Q3C C Baseline (O0) run_q3c_c: q3c_sine.elf @echo "--- Running Q3C (Fast RSQRT) C Baseline (O0) ---" $(EMU) $< # Run Q3C C Experiment (Os) run_q3c_os: $(Q3C_OS_ELF) @echo "--- Running Q3C (Fast RSQRT) C Experiment (Os) ---" $(EMU) $< # Run Q3C Assembly Optimized run_q3c_asm: $(Q3C_ASM_ELF) @echo "--- Running Q3C (Fast RSQRT) Assembly Optimized ---" $(EMU) $< # --- D. Disassemble & Clean --- dump: $(ELFS) $(Q3C_OS_ELF) $(Q2A_ASM_ELF) $(Q3C_ASM_ELF) @echo "--- Dumping Assembly for Analysis (All Versions) ---" @for elf in $(ELFS) $(Q3C_OS_ELF) $(Q2A_ASM_ELF) $(Q3C_ASM_ELF); do \ echo "Dumping $$elf assembly..."; \ $(OBJDUMP) -D $$elf > $$elf.objdump.txt; \ done clean: rm -f $(ELFS) $(Q2A_ASM_ELF) $(Q3C_ASM_ELF) $(Q3C_OS_ELF) $(Q2A_PERF_ELF) $(COMMON_OBJS) $(addsuffix .o, $(PROJECTS)) $(Q2A_ASM_OBJ) $(Q3C_ASM_OBJ) $(Q2A_C_OBJ_FOR_ASM) $(Q3C_C_OBJ_FOR_ASM) $(Q2A_PERF_C_OBJ) rm -f *.json *.objdump.txt ``` ## 2. Homework 1 Program Adaptation The "Remove Element" (LeetCode 27) algorithm was adapted to run in the bare-metal environment using custom I/O functions. This demonstrates successful compilation, linkage, and execution of a foundational array manipulation task in the constrained environment. ### hw1_remove.c ``` #include <stdbool.h> #include <stdint.h> #include <string.h> #include <stddef.h> // I/O Macros (Bare-Metal Syscall 64) #define printstr(ptr, length) \ do { \ __asm__ volatile( \ "add a7, x0, 0x40;" /* Syscall ID: 64 (write) */ \ "add a0, x0, 0x1;" /* File Descriptor: 1 (stdout) */ \ "add a1, x0, %0;" \ "mv a2, %1;" /* Length */ \ "ecall;" \ : \ : "r"(ptr), "r"(length) \ : "a0", "a1", "a2", "a7", "memory");\ } while (0) #define TEST_LOGGER(msg) \ { \ char _msg[] = msg; \ printstr(_msg, sizeof(_msg) - 1); \ } // External Functions (implemented in other files) extern uint64_t get_cycles(void); extern uint64_t get_instret(void); // --- BARE-METAL C IMPLEMENTATIONS (Required by program logic) --- // [NOTE: All required bare-metal math helpers (umod, udiv, putc, print_dec, memcpy) // should be included here, mirroring q3c_sine.c's structure.] // --- C IMPLEMENTATIONS OF HELPERS (Always Compiled) --- static unsigned long udiv(unsigned long dividend, unsigned long divisor){ if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } static unsigned long umod(unsigned long dividend, unsigned long divisor){ if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } static void putc(char c) { char temp_buf[1] = {c}; printstr(temp_buf, 1); } void print_dec(unsigned long val){ char buf[20]; char *p = buf + sizeof(buf); *(--p) = '\0'; if (val == 0) { *(--p) = '0'; } else { while (val > 0) { *(--p) = '0' + umod(val, 10); val = udiv(val, 10); } } printstr(p, (buf + sizeof(buf) - 1 - p)); putc('\n'); } // --- HW1 REMOVE ELEMENT LOGIC --- int removeElement(int* nums, int numsSize, int val) { int k = 0; // k is the slow pointer (index to write non-val elements) for (int i = 0; i < numsSize; i++) { if (nums[i] != val) { // Write non-val element to the k position nums[k] = nums[i]; k++; } } return k; // New length of the array } // --- MAIN Function (Test Framework) --- int main(void) { uint64_t start_cycles, end_cycles, cycles_elapsed; // Test Case: nums = [0,1,2,2,3,0,4,2], val = 2 int nums[] = {0, 1, 2, 2, 3, 0, 4, 2}; int numsSize = 8; int val = 2; TEST_LOGGER("\n=== HW1 Remove Element Test (C Baseline) ===\n"); TEST_LOGGER("Array Size: "); print_dec(numsSize); TEST_LOGGER("Value to Remove: "); print_dec(val); start_cycles = get_cycles(); int newLength = removeElement(nums, numsSize, val); end_cycles = get_cycles(); cycles_elapsed = end_cycles - start_cycles; TEST_LOGGER("New Length: "); print_dec(newLength); TEST_LOGGER("Verification Output:\nResult Array: ["); // Verify results by printing the remaining elements for(int i = 0; i < newLength; i++) { print_dec(nums[i]); if (i < newLength - 1) TEST_LOGGER(", "); } TEST_LOGGER("]\n"); TEST_LOGGER("HW1 CSR Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER("=== Test Completed ===\n"); return 0; } ``` ### HW1 Remove Element C Baseline Performance | Program | Compilation Flag | Verification Output | Cycles (CSR Cycles) | | -------- | -------- | -------- | -------- | | HW1 Remove Element | -O0 | Length: 5, Array: $\{0, 1, 3, 0, 4\}$ |202 | ![image](https://hackmd.io/_uploads/rytoHgGlbx.png) ## 3. Problem A (Hanoi) Optimization This task focuses on measuring and eliminating the performance overhead of the soft-implemented modulo operation (``umod``) within the Tower of Hanoi iterative solution. ### q2a_hanoi.c ``` #include <stdbool.h> #include <stdint.h> #include <string.h> #include <stddef.h> // ======================================================= // 1. BARE-METAL I/O AND HELPER FUNCTIONS // ======================================================= // I/O Macro: Uses __asm__ volatile for syscall 64 (write) #define printstr(ptr, length) \ do { \ __asm__ volatile( \ "add a7, x0, 0x40;" /* Syscall ID: 64 (write) */ \ "add a0, x0, 0x1;" /* File Descriptor: 1 (stdout) */ \ "add a1, x0, %0;" \ "mv a2, %1;" /* Length */ \ "ecall;" \ : \ : "r"(ptr), "r"(length) \ : "a0", "a1", "a2", "a7", "memory");\ } while (0) #define TEST_LOGGER(msg) \ { \ char _msg[] = msg; \ printstr(_msg, sizeof(_msg) - 1); \ } // External functions implemented in Assembly (.S) extern uint64_t get_cycles(void); extern uint64_t get_instret(void); // RV32I Math Helpers (Software implementation of division/modulo) static unsigned long udiv(unsigned long dividend, unsigned long divisor); static unsigned long umod(unsigned long dividend, unsigned long divisor); // I/O Helpers static void putc(char c); void print_dec(unsigned long val); // --- C IMPLEMENTATIONS OF HELPERS (Always Compiled) --- static unsigned long udiv(unsigned long dividend, unsigned long divisor){ if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } static unsigned long umod(unsigned long dividend, unsigned long divisor){ if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } static void putc(char c) { char temp_buf[1] = {c}; printstr(temp_buf, 1); } void print_dec(unsigned long val){ char buf[20]; char *p = buf + sizeof(buf); *(--p) = '\0'; if (val == 0) { *(--p) = '0'; } else { while (val > 0) { *(--p) = '0' + umod(val, 10); val = udiv(val, 10); } } printstr(p, (buf + sizeof(buf) - 1 - p)); putc('\n'); } // ======================================================= // 2. Q2A HANOI CORE LOGIC (Conditional Compilation) // ======================================================= static inline uint32_t gray(uint32_t n) { return n ^ (n >> 1); } #ifndef TEST_ASM_OPTIMIZED_Q2A void __attribute__((used)) tower_of_hanoi_iterative(int n){ int pegs[3] = {0, 0, 0}; int num_moves = (1 << n); const char *msg_move = "Move Disk "; const char *msg_from = " from "; const char *msg_to = " to "; const char *msg_nl = "\n"; for (int step = 1; step < num_moves; ++step) { uint32_t g_curr = gray(step); uint32_t g_prev = gray(step - 1); uint32_t diff = g_curr ^ g_prev; int disk = 0; while ((diff >>= 1)) disk++; int from, to; if (disk == 0) { from = pegs[disk]; to = (int)umod((unsigned long)from + 1, 3UL); } else { from = pegs[disk]; int sum = 0 + 1 + 2; to = sum - from - pegs[0]; } printstr((char *)msg_move, 10); putc('A' + disk); printstr((char *)msg_from, 6); putc('A' + from); printstr((char *)msg_to, 4); putc('A' + to); printstr((char *)msg_nl, 1); pegs[disk] = to; } } #else // Extern Declaration (Compiled when linking the Assembly optimized version) extern void tower_of_hanoi_iterative(int n); #endif int main(void) { uint64_t start_cycles, end_cycles, cycles_elapsed; TEST_LOGGER("\n=== Q2A Hanoi Test (C Baseline) ===\n"); start_cycles = get_cycles(); int n = 3; tower_of_hanoi_iterative(n); end_cycles = get_cycles(); cycles_elapsed = end_cycles - start_cycles; TEST_LOGGER(" Hanoi complete.\n"); TEST_LOGGER(" CSR Cycles: "); print_dec((unsigned long) cycles_elapsed); return 0; } ``` ### q2a_hanoi.S ``` # q2a_hanoi.S - 漢諾塔迭代解法 (優化版本) # 核心優化：內聯 (from + 1) % 3 邏輯 # Syscall 修正：移除不兼容的 I/O Syscalls .section .text .globl tower_of_hanoi_iterative .type tower_of_hanoi_iterative, @function # 函式簽名: void tower_of_hanoi_iterative(int n) # 參數: a0 = n (圓盤數) tower_of_hanoi_iterative: # 儲存 callee-saved 暫存器 (s0-s2 用於迴圈變數和 pegs 陣列基址) addi sp, sp, -16 sw ra, 12(sp) sw s0, 8(sp) sw s1, 4(sp) sw s2, 0(sp) # 初始化： # s0: pegs[3] 陣列的基址 (base address，在堆疊上分配 12 bytes) addi sp, sp, -12 mv s0, sp # s0 = pegs 陣列基址 sw zero, 0(s0) # pegs[0] = 0 sw zero, 4(s0) # pegs[1] = 0 sw zero, 8(s0) # pegs[2] = 0 # s1: num_moves = 1 << n li s1, 1 sll s1, s1, a0 # s1 = 1 << n (總步數 + 1) # s2: step (迴圈變數) li s2, 1 # s2 = step = 1 # 定義常數 li t0, 3 # t0 = 3 (用於優化後的 % 3 運算) li t2, 3 # t2 = sum = 3 (0+1+2) TOWER_LOOP: # 判斷迴圈是否結束: if (step >= num_moves) break; bge s2, s1, TOWER_END # 1. 計算 Gray Code: g_curr, g_prev, diff mv t3, s2 # t3 = step srli t4, t3, 1 xor t3, t3, t4 # t3 = g_curr addi t5, s2, -1 # t5 = step - 1 srli t6, t5, 1 xor t5, t5, t6 # t5 = g_prev xor t6, t3, t5 # t6 = diff # 2. 計算 disk 編號 (a1) li a1, 0 # a1 = disk = 0 FIND_DISK_LOOP: srli t6, t6, 1 beq t6, zero, DISK_FOUND addi a1, a1, 1 j FIND_DISK_LOOP DISK_FOUND: # 3. 計算 from = pegs[disk] slli t3, a1, 2 # t3 = disk * 4 add t3, s0, t3 # t3 = address of pegs[disk] lw a2, 0(t3) # a2 = from = pegs[disk] # 4. IF/ELSE 判斷: if (disk == 0) bne a1, zero, DISK_NON_ZERO # --- DISK == 0 邏輯 (to = (from + 1) % 3) --- addi a3, a2, 1 # a3 = from + 1 # **優化: 內聯 (from + 1) % 3 (取代了對 umod 的函式呼叫)** blt a3, t0, CALC_TO_DONE sub a3, a3, t0 j CALC_TO_DONE DISK_NON_ZERO: # --- DISK != 0 邏輯 (to = sum - from - pegs[0]) --- lw t4, 0(s0) # t4 = pegs[0] sub a3, t2, a2 # a3 = sum - from sub a3, a3, t4 # a3 = to = sum - from - pegs[0] CALC_TO_DONE: # a2 (from) 和 a3 (to) 已經計算完成 # ------------------------------------------------ # **移除 I/O 程式碼** (將 I/O 留給 C 語言的 TEST_LOGGER 呼叫) # ------------------------------------------------ # 5. 更新狀態: pegs[disk] = to slli t3, a1, 2 # t3 = disk * 4 add t3, s0, t3 # t3 = address of pegs[disk] sw a3, 0(t3) # pegs[disk] = to # 6. 遞增 step++ addi s2, s2, 1 # step++ j TOWER_LOOP # 重複迴圈 TOWER_END: # 清理堆疊 addi sp, sp, 12 # 釋放 pegs[3] 的空間 # 恢復 callee-saved 暫存器 lw s2, 0(sp) lw s1, 4(sp) lw s0, 8(sp) lw ra, 12(sp) addi sp, sp, 16 ret # ------------------------------------------------ # 移除所有 I/O 相關的 .data 和 ecall 指令 # ------------------------------------------------ ``` ![image](https://hackmd.io/_uploads/r1aeUlMx-x.png) ![image](https://hackmd.io/_uploads/Syh-LxMlZl.png) ![image](https://hackmd.io/_uploads/rkyXLgzxWe.png) ### Q2A Performance Comparison: Function Call Elimination | Version | Implementation | Cycles (CSR Cycles) |Time Reduction|Key Optimization| | -------- | -------- | -------- | ---- | ---- | | C Baseline (Full I/O) | C / GCC -O0 | 3742 |0% (Initial)|Includes costly I/O Syscalls (Unfair Baseline).| |C Baseline (Pure Arith)|C / GCC -O0 (No I/O)|2988|20.2% (I/O Removal)|Fair Baseline: Cost of soft arithmetic only.| |Assembly Optimized|Hand-written .S|207|93.1% (vs. 2988)|Inline modulo logic (conditional subtraction).| ### Observation and Explanation The C baseline, compiled with ``-O0``, exposed the true cost of software arithmetic. The 93.1% cycle reduction was achieved by eliminating the function call overhead associated with umod. The C version spent the majority of its 2,988 pure arithmetic cycles executing the body of the large umod routine. Our assembly solution replaced this with a highly efficient, 3-instruction peephole optimization. ## 4. Problem C (Fast RSQRT) Analysis This task analyzes the cost of complex soft-implemented mathematics (64-bit multiplication) when hardware acceleration (RV32M/F) is disabled. ### q3c_sine.c ``` #include <stdbool.h> #include <stdint.h> #include <string.h> #include <stddef.h> // I/O Macros (Bare-Metal Syscall 64) #define printstr(ptr, length) \ do { \ __asm__ volatile( \ "add a7, x0, 0x40;" /* Syscall ID: 64 (write) */ \ "add a0, x0, 0x1;" /* File Descriptor: 1 (stdout) */ \ "add a1, x0, %0;" \ "mv a2, %1;" /* Length */ \ "ecall;" \ : \ : "r"(ptr), "r"(length) \ : "a0", "a1", "a2", "a7", "memory");\ } while (0) #define TEST_LOGGER(msg) \ { \ char _msg[] = msg; \ printstr(_msg, sizeof(_msg) - 1); \ } // External Functions (implemented in other files) extern uint64_t get_cycles(void); extern uint64_t get_instret(void); // --- BARE-METAL C IMPLEMENTATIONS (Always compiled) --- void *memcpy(void *dest, const void *src, size_t n) { uint8_t *d = (uint8_t *) dest; const uint8_t *s = (const uint8_t *) src; while (n--) { *d++ = *s++; } return dest; } // Low-level arithmetic helpers (Always compiled in C) static uint32_t umul(uint32_t a, uint32_t b){ uint32_t result = 0; while (b) { if (b & 1) result += a; a <<= 1; b >>= 1; } return result; } uint32_t __mulsi3(uint32_t a, uint32_t b) { return umul(a, b); } uint64_t __muldi3(uint64_t a, uint64_t b) { uint64_t result = 0; while (b) { if (b & 1) result += a; a <<= 1; b >>= 1; } return result; } unsigned long udiv(unsigned long dividend, unsigned long divisor){ if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } unsigned long umod(unsigned long dividend, unsigned long divisor){ if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } void putc(char c) { char temp_buf[1] = {c}; printstr(temp_buf, 1); } void print_dec(unsigned long val){ char buf[20]; char *p = buf + sizeof(buf); *(--p) = '\0'; if (val == 0) { *(--p) = '0'; } else { while (val > 0) { *(--p) = '0' + umod(val, 10); val = udiv(val, 10); } } printstr(p, (buf + sizeof(buf) - 1 - p)); putc('\n'); } // --- Q3C CORE LOGIC (Macro Control) --- // Original C implementations remain here static void newton_step(uint32_t *rec_inv_sqrt, uint32_t x) { uint32_t invsqrt, invsqrt2; uint64_t val; invsqrt = *rec_inv_sqrt; invsqrt2 = __muldi3(invsqrt, invsqrt) >> 32; val = (3ULL << 32) - __muldi3(x, invsqrt2); val >>= 2; val = __muldi3(val, invsqrt) >> 31; *rec_inv_sqrt = (uint32_t)val; } static const uint32_t rsqrt_table[32] = { 65536, 46341, 32768, 23170, 16384, 11585, 8192, 5793, 4096, 2896, 2048, 1448, 1024, 724, 512, 362, 256, 181, 128, 90, 64, 45, 32, 23, 16, 11, 8, 6, 4, 3, 2, 1 }; uint32_t fast_rsqrt(uint32_t x) { if (x == 0) return ~0U; int exp = 31; while (((x >> exp) & 1) == 0 && exp > 0) exp--; uint32_t base = rsqrt_table[exp]; uint32_t next = rsqrt_table[exp + 1]; uint32_t fraction = (x - (1u << exp)) >> exp; uint32_t y = base - (__mulsi3(base - next, fraction) >> 16); newton_step(&y, x); newton_step(&y, x); return y; } uint32_t fast_distance_3d(int32_t dx, int32_t dy, int32_t dz) { uint64_t dist_sq = __muldi3(dx, dx) + __muldi3(dy, dy) + __muldi3(dz, dz); if (dist_sq > 0xFFFFFFFF) dist_sq >>= 16; uint32_t inv_dist = fast_rsqrt((uint32_t)dist_sq); uint64_t dist = __muldi3(dist_sq, inv_dist) >> 16; return (uint32_t)dist; } // --- MAIN Function (Test Framework) --- int main(void) { uint64_t start_cycles, end_cycles, cycles_elapsed; TEST_LOGGER("\n=== Q3C Fast RSQRT/Distance Test (C Baseline) ===\n"); start_cycles = get_cycles(); int32_t dx = 3, dy = 4, dz = 12; uint32_t dist = fast_distance_3d(dx, dy, dz); end_cycles = get_cycles(); cycles_elapsed = end_cycles - start_cycles; TEST_LOGGER(" Distance Result: "); print_dec((unsigned long)dist); TEST_LOGGER(" CSR Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER("=== Test Completed ===\n"); return 0; } ``` ![image](https://hackmd.io/_uploads/SkhEvefg-x.png) ![image](https://hackmd.io/_uploads/Syorvgze-e.png) ### Q3C Performance Data Summary | Version | Compilation Flag | Cycles (CSR Cycles) |Time Reduction (vs O0)|Observation| | -------- | -------- | -------- | ---- | ---- | | C Baseline| -O0 | 3614 |0% (Initial)|High cost due to unoptimized execution of soft loops in __muldi3.| |C Optimized|-O2|1840|49.1%|GCC Self-Optimization: The compiler successfully optimized the soft-multiplication loops.| ### Compiler Experiment Observations: GCC's Optimization Efficacy Observation on CFLAGS: The total execution time of the complex ``fast_distance_3d`` function was reduced by 49.1% simply by changing the compilation flag from ``-O0`` (3614 Cycles) to ``-O2`` (1840 Cycles). Explanation: This drastic reduction proves that the main bottleneck lies within the software-implemented loops used for 64-bit arithmetic (``__muldi3``). GCC, under the ``-O2`` flag, successfully performs powerful Loop Transformations (likely Loop Unrolling) on the ``while`` loop within the ``__muldi3`` and ``umul`` helper functions. This transformation drastically reduces the number of branch instructions and loop control overhead, effectively doubling the execution speed without any manual intervention. This demonstrates the compiler's superior ability to optimize repetitive arithmetic tasks compared to the simple, linear code produced by ``-O0``. ## 5. Disassembly and Final Analysis ![image](https://hackmd.io/_uploads/HyiYdeGxZg.png) This section contrasts the machine code generated by the C compiler (``-O0``) against the hand-written, optimized RISC-V assembly code (``q2a_hanoi.S``). The goal is to explain the source of the 93.1% Cycle reduction achieved in the arithmetic core. ### 5.1 C Baseline Disassembly: Function Call Overhead (``umod``) The C compiler (GCC, ``-O0``) produced minimal instructions for the overall structure but retained a massive performance bottleneck by treating the modulo operator (``%``) as an external function call. The core of the C bottleneck is seen where the result of (``from + 1``) is computed and passed to the ``umod`` function: | Instruction Address | Instruction | Comments | | -------- | -------- | -------- | | ``80000404`` | ``li a1, 3 `` | ``a1 = 3 ``(Divisor argument) | | ``80000408 `` | ``mv a0, a5 `` | ``a0 = from + 1`` (Dividend argument) | | ``8000040c`` | ``jal 80000100 <umod>`` | CRITICAL BOTTLENECK: Jump to external function.) | | ``80000410`` | ``mv a5, a0`` | ``a5`` gets return value from ``umod`` | Observation: The simple integer modulo operation (``% 3``) requires a full function call, stack setup/teardown (in ``umod``), and a software loop running 32 times (consuming hundreds of cycles per call). This overhead directly translates to the high 2,988 Cycles measured for the pure arithmetic C baseline. ### 5.2 Hand-Written Optimized Assembly Disassembly The hand-written assembly (``q2a_hanoi.S``) eliminated this function call by implementing the specialized modulo logic inline. The optimization targets the ``DISK_NON_ZERO`` logic, which corresponds to ``to = (from + 1) % 3``. | Instruction Address | Instruction | Comments | | -------- | -------- | -------- | | ``800000bc`` | ``addi a3, a2, 1 `` |``a3 = from + 1`` (Equivalent to C's ``from + 1``) | | ``800000c0 `` | ``blt a3, t0, 800000d8 <CALC_TO_DONE>`` | Inline Optimization: If ``a3 < 3 ``(t0=3), skip subtraction.| | ``800000c4`` | ``sub a3, a3, t0`` | In-place Modulo: ``a3 = a3 - 3`` (If a3 >= 3). | | ``800000c8`` | ``j 800000d8 <CALC_TO_DONE>`` | Jump to the next stage.| Final Analysis & Conclusion: Efficiency Gain: The hand-written assembly replaced a single, highly inefficient function call (``umod``, ~200+ cycles per call) with a sequence of 3 conditional instructions (one conditional branch and one subtraction). Performance: This substitution is the singular reason the Cycles dropped from 2988 to 207 (a 93.1% reduction in arithmetic time). Result: This exercise confirms that in the severely constrained RV32I bare-metal environment, manually eliminating redundant function call overhead for simple arithmetic operations is the most effective optimization technique, far surpassing the capabilities of standard GCC optimization flags.