Assignment2: RISC-V Complete Application

[toc] # Assignment2: RISC-V Complete Application contributed by [< jningmin >](https://github.com/jningmin) # Pick a question from hw1 The following question is picked from the Assignment 1 : **leetcode_70_climbing_stairs** >leetcode_70_climbing_stairs You are climbing a staircase. It takes n steps to reach the top. Each time you can either climb 1 or 2 steps. In how many distinct ways can you climb to the top? [C version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg#HW1_mainc) Implements is referance to assignment 1 leetcode70 [version2 ](https://hackmd.io/_0UERjMoRcaRxV6aGN5fkw?stext=29986%3A2533%3A0%3A1762768139%3Acymaty&both=) In a **bare-metal environment** using rv32emu’s system emulation you have to provide M extensions in I extension The following are the algebraic mathematical functions we will need to use additionally. ```c /* Software division for RV32I (no M extension) */ static unsigned long udiv(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } static unsigned long umod(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } /* Software multiplication for RV32I (no M extension) */ static uint32_t umul(uint32_t a, uint32_t b) { uint32_t result = 0; while (b) { if (b & 1) result += a; a <<= 1; b >>= 1; } return result; } ``` The following are the basic **vector implementation** methods. ```c void vector_init(vector_t *v) { v->data = NULL; v->size = 0; v->count = 0; v->free_slot = 0; } int32_t vector_push(vector_t *v, void *ptr) { if (!v->size) { v->size = VECTOR_MIN_SIZE; v->data = calloc(v->size, sizeof(void *)); } if (v->free_slot && v->free_slot < v->count) { size_t idx = v->free_slot; v->data[idx] = ptr; v->free_slot = 0; return idx; } if (v->count == v->size) { v->size *= 2; v->data = realloc(v->data, v->size * sizeof(void *)); memset(v->data + v->count, 0, (v->size - v->count) * sizeof(void *)); } v->data[v->count] = ptr; return v->count++; } void *vector_pop(vector_t *v) { if (!v->count) return NULL; void *last = v->data[--v->count]; v->data[v->count] = NULL; return last; } void vector_free(vector_t *v) { if (!v->data) return; free(v->data); v->data = NULL; v->size = 0; v->count = 0; v->free_slot = 0; } ``` The followings are main steps combined functions. ```c void generate_combinations(int n, vector_t *current) { if (n == 0) { TEST_LOGGER("["); for (size_t i = 0; i < current->count; i++) { print_dec((unsigned long)(intptr_t)current->data[i]); if (i < current->count - 1) TEST_LOGGER(","); } TEST_LOGGER("]\n"); return; } if (n >= 1) { vector_push(current, (void *)(intptr_t)1); generate_combinations(n - 1, current); vector_pop(current); } if (n >= 2) { vector_push(current, (void *)(intptr_t)2); generate_combinations(n - 2, current); vector_pop(current); } } void stairs(int n) { if (n <= 0) { TEST_LOGGER("No stairs to climb.\n"); return; } TEST_LOGGER("Climbing "); print_dec((unsigned long)n); TEST_LOGGER(" stairs:\n"); vector_t path; vector_init(&path); generate_combinations(n, &path); vector_free(&path); } ``` The following is the print decimal function ```c static void print_dec(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf) - 1; *p = '\0'; p--; if (val == 0) { *p = '0'; p--; } else { while (val > 0) { *p = '0' + umod(val, 10); p--; val = udiv(val, 10); } } p++; printstr(p, (buf + sizeof(buf) -1 - p)); } ``` The Hand Written Assembly [ASM version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg?both#HW1_stairsS) Implements is referance to assignment 1 leetcode70 [version2 ](https://hackmd.io/_0UERjMoRcaRxV6aGN5fkw#Assembly-code--version_2) The main `difference` between RIPES and system simulation using RV32EMU is: 1.You have to set a stack_top at _start ```s .globl _start _start: la sp, __stack_top call main li a7, EXIT ecall ``` 2.Completely different output method ```s li a0,STDOUT la a1, msg li a2,18 li a7, WRITE ecall ``` I chose this question because it fits the Riscvi pattern perfectly, making it suitable for learning a simulator that you're just starting out with. **In a bare-metal environment, program behavior is similar to that of a regular compiler, but the presentation is different.** In a bare-metal environment, there is no operating system or standard C library to handle dynamic memory, so we must implement` malloc`, `free`, `realloc`, and `calloc` ourselves. ```c void *malloc(size_t size) { if (size == 0) return NULL; /* align to 8 bytes */ size = (size + 7) & ~((size_t)7); if (__simple_heap_top + size > SIMPLE_HEAP_SIZE) return NULL; void *p = &__simple_heap[__simple_heap_top]; __simple_heap_top += size; return p; } void free(void *ptr) { /* no-op for bump allocator */ (void)ptr; } void *realloc(void *ptr, size_t size) { if (ptr == NULL) return malloc(size); if (size == 0) { free(ptr); return NULL; } void *newp = malloc(size); if (!newp) return NULL; memcpy(newp, ptr, size); return newp; } void *calloc(size_t nmemb, size_t size) { size_t total = nmemb * size; void *p = malloc(total); if (p) { unsigned char *q = p; for (size_t i = 0; i < total; ++i) q[i] = 0; } return p; } ``` ## hw1_c output after compiled ```s #make run ../../../build/rv32enu test.elf 16:06:27 INFO src/riscv.c:552: Log level: TRACE 16:06:27 INFO src/riscv.c:565: test.elf ELF loaded 16:06:27 INFO src/main.c: 315: RISC-V emulator is created and ready to run == Lab2_Homeworkl Tests === Test 1: n = 3 Climbing 3 stairs: [1,1,1] [1,2] [2,1] Cycles: 11689 Instructions: 11689 Test 2: n = 4 Climbing 4 stairs: [1,1,1,1] [1,1,2] [1,2,1] [2,1,1] [2,2] Cycles: 22371 Instructions: 22371 Test 3:n = 5 Climbing 5 stairs: [1,1,1,2] [1,1,2,1] [1,2,1,1] [1,2,2] [2,1,1,1] [2,1,2] [2,2,1] Cycles: 42258 Instructions: 42258 — ALL Tests Completed == 16:06:27 INFO src/main.c:338: RISC-V emulator ``` **Compiling using RV32I bare-metal programming** To compile the .c file, I use the following command. ```c make riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -O2 -c main.c -o main.o riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld main.o -o test.elf ``` ## hw1_S output after compiled ```s #make тип ../../../build/rv32emu test.elf 21:08:35 INFO src/riscv.c:552: Log level: TRACE 21:08:35 INFO src/riscv.c:565: test.elf ELF loaded 21:08:35 INFO src/main.c:315: RISC-V emulator is created and ready to run How many stairs? n = 3 Step combinations: { 1 1 1 } { 1 2 ] { 1 2 } Cycles: 631 п = 4 Step combinations: { 1 1 1 1 } { 1 1 2 } { 1 2 1 } { 2 1 1 } { 2 2 } CycLes: 1250 n = 5 Step combinations: { 1 1 1 1 1 } { 1 1 1 2 } { 1 1 2 1 } { 1 2 1 1 } { 1 2 2 } { 2 1 1 1 } { 2 1 2 } { 2 2 1 } CycLes: 2352 21:08:35 INFO src/main.c: 338: RISC-V emulator is destroyed ``` **Compiling using RV32I bare-metal programming** To compile the .S file, I use the following command. ```c make riscv-none-elf-ld -T linker.ld -o test.elf perfcounter.o stairs.o riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld perfcounter.o stairs.o -o test.elf ``` From the two outputs above, we can conclude that the cycle time using C version is significantly longer than RISC-V version. # Quiz2_problem_A This program approaches the Tower of Hanoi puzzle with an iterative strategy. It leverages Gray codes, where each code sequence acts as steps along a Hamiltonian path on an n-dimensional hypercube, with each bit representing a position or coordinate. [C version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg?both#quiz2_mainc) We got the same print decimal, algebraic mathematical functions as leet_code_70 The main functions of Hanoi algorithm and Gray code transfer ```c unsigned char gray_code_trans(unsigned char n) { return n ^ (n >> 1); } void hanoi_with_gray(int n, int* moves) { unsigned char current, previous = 0; for (int i = 1; i <= (1 << n) - 1; i++) { current = gray_code_trans(i); unsigned char diff = current ^ previous; // find change bit int disk = 0; while ((diff & (1 << disk)) == 0) { disk++; } int from_peg = moves[disk]; int to_peg; if (disk == 0) { // smallest disk int small_shift = (umod((unsigned long)n, 2) == 0) ? 1 : 2; to_peg = umod((unsigned long)(moves[0] + small_shift + 3), 3); } else { int tmp = 3 - moves[disk] - moves[0]; to_peg = umod((unsigned long)(tmp + 3), 3); } TEST_LOGGER("Move disk "); char disk_letter = 'A' + disk; TEST_OUTPUT(&disk_letter, 1); TEST_LOGGER(" from peg "); print_dec(from_peg + 1); TEST_LOGGER(" to peg "); print_dec(to_peg + 1); TEST_LOGGER("\n"); moves[disk] = to_peg; previous = current; } } ``` [ASM version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg?both#quiz2_hanoi_graycodeS) `hanoi:`Initializes the loop limit (1<<n)-1 and iteratively generates each move. Each iteration converts the step number to a Gray code, compares it with the previous value, and determines which disk changed. ```s hanoi: # bound = (1 << n) - 1 li t0, 1 sll s4, t0, s1 # s4 = 1 << n addi s4, s4, -1 # s4 = (1<<n) - 1 li t1, 1 # i = 1 li s2, 0 # previous = 0 hanoi_loop: bgt t1, s4, hanoi_end # current = gray_code_trans(i) mv a0, t1 jal ra, gray_code_trans mv s3, a0 # s3 = current xor t2, s3, s2 # diff = current ^ previous li t3, 0 #disk ``` `find_disk:`Finds the disk corresponding to the flipped bit (difference between current and previous Gray code). Then determines the source peg and computes the destination peg based on Hanoi movement rules (different logic for the smallest disk vs. larger ones). ```s find_disk: li t4, 1 sll t4, t4, t3 # mask = 1 << t3 and t5, t2, t4 bnez t5, disk_found addi t3, t3, 1 j find_disk disk_found: # t3 = disk index (0=A,1=B,2=C) # load current pos of that disk la t6, moves slli t4, t3, 2 add t6, t6, t4 lw t0, 0(t6) # t0 = from_peg (0..2) mv s5, t0 # save from peg in s5 # compute to_peg ,t0 will be overwritten with target # if disk == 0 , smallest disk beq t3, x0, handle_small # else (larger disk) la t6, moves lw s7, 0(t6) # t7 = moves[0] (ref) li s8, 3 sub s8, s8, t0 # t8 = 3 - cur sub s8, s8, s7 # t8 = 3 - cur - ref # normalize t8 into 0..2 blt s8, x0, sub_add3_larger li s9, 3 blt s8, s9, store_target_large sub s8, s8, s9 j store_target_large sub_add3_larger: addi s8, s8, 3 store_target_large: mv t0, s8 j do_print_and_store handle_small: # small_shift = (n % 2 == 0) ? 1 : 2 andi t6, s1, 1 # t6 = n & 1 beq t6, x0, small_shift_is_one li t6, 2 j small_shift_done small_shift_is_one: li t6, 1 small_shift_done: add s8, t0, t6 # t8 = moves[0] + small_shift li s9, 3 blt s8, s9, store_target_small sub s8, s8, s9 store_target_small: mv t0, s8 ``` `gray_code_trans:` Converts an integer i into Gray code using: gray = i ^ (i >> 1) This ensures only one disk changes per step (key to iterative Hanoi). ```s gray_code_trans: mv t0, a0 srli s10, a0, 1 xor a0, t0, s10 mv t0,x0 ret ``` `print_int:` Prints an integer without using hardware division. The number is repeatedly reduced by 10 to extract digits, converts each digit to ASCII, and outputs using a system call. ```s print_int: addi sp, sp, -20 sw ra, 0(sp) sw t0, 4(sp) sw t1, 8(sp) sw t2, 12(sp) sw t3, 16(sp) la t0, buffer addi t1, t0, 10 sb x0, 0(t1) # '\0' beqz a0, print_zero_I print_int_loop_I: # --- 計算 a0 / 10 與餘數 --- li t3, 0 # q = t3 li t4, 0 # rem = t4 li t2, 10 div_loop_I: blt a0, t2, div_end_I # a0 < 10 addi a0, a0, -10 addi t3, t3, 1 # q + 1 j div_loop_I div_end_I: mv t4, a0 # rem mv a0, t3 # q = a0 addi t4, t4, 48 # ASCII addi t1, t1, -1 sb t4, 0(t1) bnez a0, print_int_loop_I # --- OUTPUT --- li a0, 1 mv a1, t1 la t2, buffer addi t2, t2, 10 sub a2, t2, t1 li a7, 64 ecall j print_int_end_I print_zero_I: li t2, 48 sb t2, 0(t0) li a0, 1 la a1, buffer li a2, 1 li a7, 64 ecall print_int_end_I: lw ra, 0(sp) lw t0, 4(sp) lw t1, 8(sp) lw t2, 12(sp) lw t3, 16(sp) addi sp, sp, 20 ret ``` ## quiz2_hanoi_gray_bare.c output after compiled ```s ＃make run ../../. ./build/rv32emu test.elf 12:41:24 INFO STC/riscv.c:552: Log level: TRACE 12:41:24 INFO src/riscv.c:565: test.elf ELF Loaded 12:41:24 INFO src/main.c:315: RISC-V emulator is created and ready to run Moving 3 disks using Gray code sequence: Move disk A from peg 1 to peg 3 Move disk B from peg 1 to peg 2 Move disk A from peg 3 to peg 2 Move disk C from peg 1 to peg 3 Move disk A from peg 2 to peg 1 Move disk B from peg 2 to peg 3 Move disk A from peg 1 to peg 3 Cycles: 25199 Instructions: 25199 12:41:24 INFO src/main.c:338: RISC-V emulator is destroyed ``` **Compiling using RV32I bare-metal programming** To compile the .c file, I use the following command. ```c make riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -O2 -c main.c -o main.o riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld main.o -o main.elf ``` ## quiz2_hanoi_gray_bare.S output after compiled ```s #make run ../../../build/rv32emu test.elf 14:57:17 INFO src/riscv.c: 552: Log Level: TRACE 14:57:17 INFO src/riscv.c: 565: test.elf ELF Loaded 14:57:17 INFO src/main.c:315: RISC-V emulator is created and ready to run Moving 3 disks using Gray code sequence: Move disk A from peg 1 to peg 3 Move disk B from peg 1 to peg 2 Move disk A from peg 3 to peg 2 Move disk C from peg 1 to peg 3 Move disk A from peg 2 to peg 1 Move disk B from peg 2 to peg 3 Move disk A from peg 1 to peg 3 Cycles: 704 14:57:17 INFO src/main.c:338: RISC-V emulator is destroyed ``` **Compiling using RV32I bare-metal programming** To compile the .S file, I use the following command. ```c make riscv-none-elf-ld -T linker.ld -o test.elf perfcounter.o hanoi_gray_bare.o riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld perfcounter.o hanoi_gray_bare.o -o test.elf ``` From the two outputs above, we can conclude that the cycle time using C version is significantly longer than RISC-V version. # Quiz3_problem_C `rsqrt` implements a fast reciprocal square root on RV32I, optimized to run without a hardware multiplier. It uses a LUT-based initial guess followed by Newton iterations to refine accuracy. All multiplications are performed using shift-add logic. The result achieves roughly 3–8% error, balancing speed and precision. ![image](https://hackmd.io/_uploads/S1iVk001Zg.png=150x) Final formula: $$ y_{n+1} = y_n \left(\frac{3 - xy_n^2}{2}\right) = y_n \left(\frac{3}{2} - \frac{xy_n^2}{2}\right) $$ [C version of Source code](https://hackmd.io/kvq6kXeRRmeeJGepGiX1Mg?both#quiz3_mainc) `fast_rsqrt():`Computes an approximate reciprocal square root (1 / sqrt(x)) in Q16 fixed-point, optimized for RV32I. >Steps: Get exponent of x using clz() Use lookup table (rsqrt_table) to get initial guess Optional linear interpolation makes the guess more accurate Performs two iterations of Newton's method Newton update formula (fixed-point version): y = y * (1.5 – 0.5 * x * y²) Very fast and avoids float hardware. ```C uint32_t fast_rsqrt(uint32_t x) { if (x == 0) return 0xFFFFFFFF; /* saturate */ if (x == 1) return (1U << SCALE); /* 1.0 in Q16 */ /* exponent = floor(log2(x)) */ int exp = 31 - clz(x); uint32_t y = rsqrt_table[exp]; /* Q16 initial */ /* linear interpolation for better initial guess */ if (x > (1u << exp)) { uint32_t y1 = (exp < 31) ? rsqrt_table[exp + 1] : 0; uint32_t diff = (y > y1) ? (y - y1) : 0; uint32_t frac = (uint32_t)((((uint64_t)x - (1ULL << exp)) << SCALE) >> exp); /* Q16 fraction */ y -= (uint32_t)( (mul32(diff, frac)) >> SCALE ); /* keep Q16 */ } for (int i = 0; i < 2; i++) { uint64_t y2 = mul32(y, y); /* Q32 */ uint32_t y2_hi = (uint32_t)(y2 >> 16); /* Q16 */ uint64_t xy2 = mul32(x, y2_hi); /* A_Q16 */ uint64_t three_q16 = (uint64_t)3U << SCALE; /* 3 in Q16 */ /* compute B_Q16 = ((3<<16) - xy2) >> 1 (still fits <= 32 bits for valid inputs) */ uint64_t tmp; if (xy2 >= three_q16) tmp = 0; else tmp = (three_q16 - xy2) >> 1; uint32_t B_q16 = (uint32_t)tmp; y = (uint32_t)( mul32(y, B_q16) >> SCALE ); /* Q16 */ } return y; } ``` `uint32_t fast_distance_3d():`Computes distance = sqrt(dx² + dy² + dz²) but avoids real square root. >Steps: Take absolute values of dx, dy, dz Compute squared distance: dx² + dy² + dz² Scale down if overflow risk Compute 1/sqrt(dist_sq) via fast_rsqrt() Multiply result back to recover sqrt() Faster than integer sqrt, especially on RV32I. ```C uint32_t fast_distance_3d(int32_t dx, int32_t dy, int32_t dz) { /* absolute values */ dx = (dx < 0) ? -dx : dx; dy = (dy < 0) ? -dy : dy; dz = (dz < 0) ? -dz : dz; /* sum of squares (use 64-bit) */ uint64_t dist_sq = mul32((uint32_t)dx, (uint32_t)dx) + mul32((uint32_t)dy, (uint32_t)dy) + mul32((uint32_t)dz, (uint32_t)dz); uint32_t scaled_dist_sq; int shift = 0; if (dist_sq > 0xFFFFFFFFULL) { scaled_dist_sq = (uint32_t)(dist_sq >> 16); /* scale down */ shift = 8; /* will multiply result by 2^8 later */ } else { scaled_dist_sq = (uint32_t)dist_sq; } uint32_t inv_sqrt = fast_rsqrt(scaled_dist_sq); /* Q16 */ uint64_t result = mul32(scaled_dist_sq, inv_sqrt) >> SCALE; /* back to Q0 (integer) */ if (shift) result <<= shift; return (result > UINT32_MAX) ? UINT32_MAX : (uint32_t)result; } ``` `mul32()/udiv64()`: These functions exist because RV32I does not have a hardware multiplier (no M-extension). ```C static uint64_t mul32(uint32_t a, uint32_t b) { uint64_t result = 0; for (int i = 0; i < 32; i++) { if (b & (1U << i)) { result += (uint64_t)a << i; } } return result; } static uint64_t udiv64(uint64_t dividend, uint32_t divisor) { if (divisor == 0) return (uint64_t)-1; uint64_t quotient = 0; uint64_t rem = 0; for (int i = 63; i >= 0; i--) { rem = (rem << 1) | ((dividend >> i) & 1ULL); if (rem >= divisor) { rem -= divisor; quotient |= (1ULL << i); } } return quotient; } ``` `clz()`:Counts the number of leading zeros in a 32-bit integer. Used to compute log2(x) efficiently, which determines the magnitude of x during rsqrt. ```C static int clz(uint32_t x){ if (!x) return 32; int n = 0; if ((x & 0xFFFF0000) == 0) { n += 16; x <<= 16; } if ((x & 0xFF000000) == 0) { n += 8; x <<= 8; } if ((x & 0xF0000000) == 0) { n += 4; x <<= 4; } if ((x & 0xC0000000) == 0) { n += 2; x <<= 2; } if ((x & 0x80000000) == 0) { n += 1; } return n; } ``` `my_abs() / f_abs()`:Computes absolute values using bit manipulation instead of branching. my_abs() handles signed integers; f_abs() returns the absolute difference between two unsigned integers. ```C static int32_t my_abs(int32_t x) { int32_t mask = x >> 31; return (x + mask) ^ mask; } static uint32_t f_abs(uint32_t x, uint32_t y) { return (x >= y) ? (x - y) : (y - x); } ``` `rsqrt_table[32]`:A precomputed Q16 fixed-point lookup table storing an approximate initial value of 1/sqrt(x) based on the exponent of x. This drastically reduces the number of Newton iterations required. ```C static const uint16_t rsqrt_table[32] = { 65535U, 46341U, 32768U, 23170U, 16384U, 11585U, 8192U, 5793U, 4096U, 2896U, 2048U, 1448U, 1024U, 724U, 512U, 362U, 256U, 181U, 128U, 90U, 64U, 45U, 32U, 23U, 16U, 11U, 8U, 6U, 4U, 3U, 2U, 1U }; ``` `my_sqrt():`Bit-wise integer square root implementation. Useful for comparison or fallback, but slower than fast_rsqrt(). ```C static uint32_t my_sqrt(uint32_t x) { if (x == 0) return 0; uint32_t result = 0; uint32_t bit = 1U << 30; while (bit > x) bit >>= 2; while (bit != 0) { if (x >= result + bit) { x -= result + bit; result = (result >> 1) + bit; } else { result >>= 1; } bit >>= 2; } return result; } ``` [ASM version of Source code]() ## quiz3_rsqrt_bare.c output after compiled ```S #make run ../../../build/rv32emu test.elf 15:51:17 INFO src/riscv.c:552: Log Level: TRACE 15:51:17 INFO src/riscv.c:565: test.elf ELF Loaded 15:51:17 INFO src/main.c:315: RISC-V emulator is created and ready to run === Part 1: Reciprocal Square Root Comparison === x = 4 fast = 0.5000 expected = 0.5000 error = 0.000% Cycles: 2865 Instructions: 2865 x = 2000 fast = 0.0223 expected = 0.0223 error = 0.000% Cycles: 4313 Instructions: 4313 x = 50000 fast = 0.0058 expected = 0.0044 error = 30.716% Cycles: 4050 Instructions: 4050 = Part 2: 3D Distance Approxination (scaled correctly) == x = (3,4,12) fast = 13 expected = 13 error = 0% Cycles: 6252 Instructions: 6252 x = (10,20,30) fast = 37 expected = 37 error = 0% Cycles: 5906 Instructions: 5906 x = (100,200,300) fast = 245 expected = 374 error = 34% CycLes: 5343 Instructions: 5343 15:51:17 INFO src/main.c: 338: RISC-V emulator is destroyed ``` **Compiling using RV32I bare-metal programming** ```c make riscv-none-elf-ld -T linker.ld start.o perfcounter.o rsqrt_bare.o -o test.elf riscv-none-elf-gcc -march=rv32i_zicsr -mabi=ilp32 -nostartfiles -T linker.ld start.o perfcounter.o rsqrt_bare.o -o test.elf ``` From the above output, we can conclude that a larger test case size does not necessarily mean a longer cycle time, but a larger test case size may result in a higher percentage of errors. # Full code ## HW1_main.c ```c #include <stdbool.h> #include <stdint.h> #include <string.h> #define SIMPLE_HEAP_SIZE (16 * 1024) static unsigned char __simple_heap[SIMPLE_HEAP_SIZE]; static size_t __simple_heap_top = 0; void *malloc(size_t size) { if (size == 0) return NULL; /* align to 8 bytes */ size = (size + 7) & ~((size_t)7); if (__simple_heap_top + size > SIMPLE_HEAP_SIZE) return NULL; void *p = &__simple_heap[__simple_heap_top]; __simple_heap_top += size; return p; } void free(void *ptr) { /* no-op for bump allocator */ (void)ptr; } void *realloc(void *ptr, size_t size) { if (ptr == NULL) return malloc(size); if (size == 0) { free(ptr); return NULL; } void *newp = malloc(size); if (!newp) return NULL; memcpy(newp, ptr, size); return newp; } void *calloc(size_t nmemb, size_t size) { size_t total = nmemb * size; void *p = malloc(total); if (p) { unsigned char *q = p; for (size_t i = 0; i < total; ++i) q[i] = 0; } return p; } void *memset(void *s, int c, size_t n) { unsigned char *p = s; unsigned char uc = (unsigned char)c; for (size_t i = 0; i < n; ++i) p[i] = uc; return s; } #define VECTOR_MIN_SIZE 16 typedef struct { void **data; size_t size; /* Allocated size */ size_t count; /* Number of elements */ size_t free_slot; /* Index of a known hole */ } vector_t; #define printstr(ptr, length) \ do { \ asm volatile( \ "add a7, x0, 0x40;" \ "add a0, x0, 0x1;" /* stdout */ \ "add a1, x0, %0;" \ "mv a2, %1;" /* length character */ \ "ecall;" \ : \ : "r"(ptr), "r"(length) \ : "a0", "a1", "a2", "a7"); \ } while (0) #define TEST_OUTPUT(msg, length) printstr(msg, length) #define TEST_LOGGER(msg) \ { \ char _msg[] = msg; \ TEST_OUTPUT(_msg, sizeof(_msg) - 1); \ } extern uint64_t get_cycles(void); extern uint64_t get_instret(void); /* Bare metal memcpy implementation */ void *memcpy(void *dest, const void *src, size_t n) { uint8_t *d = (uint8_t *) dest; const uint8_t *s = (const uint8_t *) src; while (n--) *d++ = *s++; return dest; } /* Software division for RV32I (no M extension) */ static unsigned long udiv(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } static unsigned long umod(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } /* Software multiplication for RV32I (no M extension) */ static uint32_t umul(uint32_t a, uint32_t b) { uint32_t result = 0; while (b) { if (b & 1) result += a; a <<= 1; b >>= 1; } return result; } /* Provide __mulsi3 for GCC */ uint32_t __mulsi3(uint32_t a, uint32_t b) { return umul(a, b); } /* Simple integer to decimal string conversion */ static void print_dec(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf); *--p = '\0'; if (val == 0) { *--p = '0'; } else { while (val > 0) { *--p = '0' + umod(val, 10); val = udiv(val, 10); } } printstr(p, (buf + sizeof(buf) -1 - p)); } /* --------Vector*/ void vector_init(vector_t *v) { v->data = NULL; v->size = 0; v->count = 0; v->free_slot = 0; } int32_t vector_push(vector_t *v, void *ptr) { if (!v->size) { v->size = VECTOR_MIN_SIZE; v->data = calloc(v->size, sizeof(void *)); } if (v->free_slot && v->free_slot < v->count) { size_t idx = v->free_slot; v->data[idx] = ptr; v->free_slot = 0; return idx; } if (v->count == v->size) { v->size *= 2; v->data = realloc(v->data, v->size * sizeof(void *)); memset(v->data + v->count, 0, (v->size - v->count) * sizeof(void *)); } v->data[v->count] = ptr; return v->count++; } void *vector_pop(vector_t *v) { if (!v->count) return NULL; void *last = v->data[--v->count]; v->data[v->count] = NULL; return last; } void vector_free(vector_t *v) { if (!v->data) return; free(v->data); v->data = NULL; v->size = 0; v->count = 0; v->free_slot = 0; } void generate_combinations(int n, vector_t *current) { if (n == 0) { TEST_LOGGER("["); for (size_t i = 0; i < current->count; i++) { print_dec((unsigned long)(intptr_t)current->data[i]); if (i < current->count - 1) TEST_LOGGER(","); } TEST_LOGGER("]\n"); return; } if (n >= 1) { vector_push(current, (void *)(intptr_t)1); generate_combinations(n - 1, current); vector_pop(current); } if (n >= 2) { vector_push(current, (void *)(intptr_t)2); generate_combinations(n - 2, current); vector_pop(current); } } void stairs(int n) { if (n <= 0) { TEST_LOGGER("No stairs to climb.\n"); return; } TEST_LOGGER("Climbing "); print_dec((unsigned long)n); TEST_LOGGER(" stairs:\n"); vector_t path; vector_init(&path); generate_combinations(n, &path); vector_free(&path); } int main(void) { uint64_t start_cycles, end_cycles, cycles_elapsed; uint64_t start_instret, end_instret, instret_elapsed; TEST_LOGGER("\n=== Lab2_Homework1 Tests ===\n"); /* Test 1:*/ TEST_LOGGER("Test 1: n = 3\n"); start_cycles = get_cycles(); start_instret = get_instret(); stairs(3); end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; TEST_LOGGER(" Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER(" Instructions: "); print_dec((unsigned long) instret_elapsed); TEST_LOGGER("\n"); /* Test 2: */ TEST_LOGGER("Test 2: n = 4\n"); start_cycles = get_cycles(); start_instret = get_instret(); stairs(4); end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; TEST_LOGGER(" Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER(" Instructions: "); print_dec((unsigned long) instret_elapsed); TEST_LOGGER("\n"); /* Test 3: */ TEST_LOGGER("Test 3:n = 5\n"); start_cycles = get_cycles(); start_instret = get_instret(); stairs(5); end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; TEST_LOGGER(" Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER(" Instructions: "); print_dec((unsigned long) instret_elapsed); TEST_LOGGER("\n"); TEST_LOGGER("\n=== All Tests Completed ===\n"); return 0; } ``` ## HW1_stairs.S ```s .data .align 2 msg: .string "How many stairs?\n" .align 2 msg1: .string "Step combinations:\n" .align 2 msg2: .string "{ " .align 2 msg3: .string "}\n" .align 2 msg4: .string " " .align 2 msg5: .string "\n" .align 2 msg_cycles: .string " Cycles: " .align 2 testcase_a: .string "n = 3" .align 2 testcase_b: .string "n = 4" .align 2 testcase_c: .string "n = 5" .align 2 steps: .space 256 .align 2 buffer: .space 64 .equ STDOUT, 1 .equ WRITE, 64 .equ EXIT, 93 .text .align 2 .globl _start _start: la sp, __stack_top call main li a7, EXIT ecall main: # Print "How many stairs?" li a0,STDOUT la a1, msg li a2,18 li a7, WRITE ecall ##------------------A--------------------------- li a0, STDOUT la a1, testcase_a li a2, 5 li a7, WRITE ecall li a0, STDOUT la a1, msg5 li a2, 1 li a7, WRITE ecall li a0, STDOUT la a1, msg1 li a2, 19 li a7, WRITE ecall li t4, 3 # n = 3 mv s2, x0 mv s3, x0 ##------------cycle------------------------------- call get_cycles mv s4, a0 # low mv s5, a1 # high jal ra, printSteps call get_cycles mv t0, a0 # end low mv t1, a1 # end high # 計算差值 sub s1, t0, s4 sub s6, t1, s5 # 輸出 cycles li a0, STDOUT la a1, msg_cycles li a2, 10 li a7, WRITE ecall mv a0, s1 # low bits jal ra, print_int li a0, STDOUT la a1, msg5 li a2, 1 li a7, WRITE ecall ##------------------B--------------------------- li a0, STDOUT la a1, msg5 li a2, 1 li a7, WRITE ecall li a0, STDOUT la a1, testcase_b li a2, 5 li a7, WRITE ecall li a0, STDOUT la a1, msg5 li a2, 1 li a7, WRITE ecall li a0, STDOUT la a1, msg1 li a2, 19 li a7, WRITE ecall li t4, 4 # n = 4 mv s2, x0 mv s3, x0 ##------------cycle------------------------------- call get_cycles mv s4, a0 # low mv s5, a1 # high jal ra, printSteps call get_cycles mv t0, a0 # end low mv t1, a1 # end high # 計算差值 sub s1, t0, s4 sub s6, t1, s5 # 輸出 cycles li a0, STDOUT la a1, msg_cycles li a2, 10 li a7, WRITE ecall mv a0, s1 # low bits jal ra, print_int li a0, STDOUT la a1, msg5 li a2, 1 li a7, WRITE ecall ##------------------C--------------------------- li a0, STDOUT la a1, msg5 li a2, 1 li a7, WRITE ecall li a0, STDOUT la a1, testcase_c li a2, 5 li a7, WRITE ecall li a0, STDOUT la a1, msg5 li a2, 1 li a7, WRITE ecall li a0, STDOUT la a1, msg1 li a2, 19 li a7, WRITE ecall li t4, 5 # n = 5 mv s2, x0 mv s3, x0 ##------------cycle------------------------------- call get_cycles mv s4, a0 # low mv s5, a1 # high jal ra, printSteps call get_cycles mv t0, a0 # end low mv t1, a1 # end high # 計算差值 sub s1, t0, s4 sub s6, t1, s5 # 輸出 cycles li a0, STDOUT la a1, msg_cycles li a2, 10 li a7, WRITE ecall mv a0, s1 # low bits jal ra, print_int li a0, STDOUT la a1, msg5 li a2, 1 li a7, WRITE ecall end: li a7, EXIT ecall printSteps: bnez t4, L1 printSteps_0: addi s2, s2, 1 # Print "{ " li a0, STDOUT la a1, msg2 li a2, 2 li a7, WRITE ecall # loop print steps addi sp, sp, -4 sw ra, 0(sp) li t0, 0 loop_0: la a1,steps slli t2, t0, 2 add t1, a1, t2 lw t3, 0(t1) mv a0, t3 jal ra, print_int li a0, STDOUT la a1, msg4 li a2, 1 li a7, WRITE ecall addi t0, t0, 1 bltu t0, s3, loop_0 # Print "}\n" li a0, STDOUT la a1, msg3 li a2, 2 li a7, WRITE ecall lw ra, 0(sp) addi sp, sp, 4 ret L1: li t0, 1 bltu t4, t0, R addi sp, sp, -16 sw ra, 0(sp) sw t4, 4(sp) sw s3, 8(sp) sw a1, 12(sp) jal ra, printSteps_1 lw ra, 0(sp) lw t4, 4(sp) lw s3, 8(sp) lw a1, 12(sp) addi sp, sp, 16 L2: li t0, 2 bltu t4, t0, R addi sp, sp, -16 sw ra, 0(sp) sw t4, 4(sp) sw s3, 8(sp) sw a1, 12(sp) jal ra, printSteps_2 lw ra, 0(sp) lw t4, 4(sp) lw s3, 8(sp) lw a1, 12(sp) addi sp, sp, 16 R: ret #----------------------------------------------- printSteps_1: la a1,steps slli t2, s3, 2 add t1, a1, t2 li t6, 1 sw t6, 0(t1) addi t4, t4, -1 addi s3, s3, 1 addi sp, sp, -12 sw ra, 0(sp) sw t4, 4(sp) sw s3, 8(sp) jal ra, printSteps lw ra, 0(sp) lw t4, 4(sp) lw s3, 8(sp) addi sp, sp, 12 ret #----------------------------------------------- printSteps_2: la a1,steps slli t2, s3, 2 add t1, a1, t2 li t6, 2 sw t6, 0(t1) addi t4, t4, -2 addi s3, s3, 1 addi sp, sp, -12 sw ra, 0(sp) sw t4, 4(sp) sw s3, 8(sp) jal ra, printSteps lw ra, 0(sp) lw t4, 4(sp) lw s3, 8(sp) addi sp, sp, 12 ret #----------------------------------------------- print_int: addi sp, sp, -20 sw ra, 0(sp) sw t0, 4(sp) sw t1, 8(sp) sw t2, 12(sp) sw t3, 16(sp) la t0, buffer # buffer 開頭 addi t1, t0, 10 # 結尾位置 sb x0, 0(t1) # 加上 '\0' beqz a0, print_zero_I # 若為 0 特殊處理 print_int_loop_I: # --- 計算 a0 / 10 與餘數 --- li t3, 0 # 商 = 0 li t4, 0 # 餘數 = 0 li t2, 10 div_loop_I: blt a0, t2, div_end_I # 若 a0 < 10，結束 addi a0, a0, -10 # 減 10 addi t3, t3, 1 # 商 + 1 j div_loop_I div_end_I: mv t4, a0 # 剩下的就是餘數 mv a0, t3 # 商成為新的 a0 addi t4, t4, 48 # 轉 ASCII addi t1, t1, -1 sb t4, 0(t1) bnez a0, print_int_loop_I # --- 輸出字串 --- li a0, 1 # STDOUT mv a1, t1 la t2, buffer addi t2, t2, 10 sub a2, t2, t1 # 長度 li a7, 64 # WRITE ecall j print_int_end_I print_zero_I: li t2, 48 sb t2, 0(t0) li a0, 1 la a1, buffer li a2, 1 li a7, 64 ecall print_int_end_I: lw ra, 0(sp) lw t0, 4(sp) lw t1, 8(sp) lw t2, 12(sp) lw t3, 16(sp) addi sp, sp, 20 ret ``` ## quiz2_main.c ```c #include <stdbool.h> #include <stdint.h> #include <string.h> #include <stdio.h> #define printstr(ptr, length) \ do { \ asm volatile( \ "add a7, x0, 0x40;" \ "add a0, x0, 0x1;" /* stdout */ \ "add a1, x0, %0;" \ "mv a2, %1;" /* length character */ \ "ecall;" \ : \ : "r"(ptr), "r"(length) \ : "a0", "a1", "a2", "a7"); \ } while (0) #define TEST_OUTPUT(msg, length) printstr(msg, length) #define TEST_LOGGER(msg) \ { \ char _msg[] = msg; \ TEST_OUTPUT(_msg, sizeof(_msg) - 1); \ } extern uint64_t get_cycles(void); extern uint64_t get_instret(void); /* Bare metal memcpy implementation */ void *memcpy(void *dest, const void *src, size_t n) { uint8_t *d = (uint8_t *) dest; const uint8_t *s = (const uint8_t *) src; while (n--) *d++ = *s++; return dest; } /* Software division for RV32I (no M extension) */ static unsigned long udiv(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } static unsigned long umod(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } /* Simple integer to decimal string conversion */ static void print_dec(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf) - 1; *p = '\0'; p--; if (val == 0) { *p = '0'; p--; } else { while (val > 0) { *p = '0' + umod(val, 10); p--; val = udiv(val, 10); } } p++; printstr(p, (buf + sizeof(buf) -1 - p)); } unsigned char gray_code_trans(unsigned char n) { return n ^ (n >> 1); } void hanoi_with_gray(int n, int* moves) { unsigned char current, previous = 0; for (int i = 1; i <= (1 << n) - 1; i++) { current = gray_code_trans(i); unsigned char diff = current ^ previous; // 找出改變的位元位置 int disk = 0; while ((diff & (1 << disk)) == 0) { disk++; } int from_peg = moves[disk]; int to_peg; if (disk == 0) { // smallest disk int small_shift = (umod((unsigned long)n, 2) == 0) ? 1 : 2; to_peg = umod((unsigned long)(moves[0] + small_shift + 3), 3); } else { int tmp = 3 - moves[disk] - moves[0]; to_peg = umod((unsigned long)(tmp + 3), 3); } TEST_LOGGER("Move disk "); char disk_letter = 'A' + disk; TEST_OUTPUT(&disk_letter, 1); TEST_LOGGER(" from peg "); print_dec(from_peg + 1); TEST_LOGGER(" to peg "); print_dec(to_peg + 1); TEST_LOGGER("\n"); moves[disk] = to_peg; previous = current; } } int main() { uint64_t start_cycles, end_cycles, cycles_elapsed; uint64_t start_instret, end_instret, instret_elapsed; int n = 3; //max_steps =2^n int moves[8] = {0}; TEST_LOGGER("Moving "); print_dec(n); TEST_LOGGER(" disks using Gray code sequence:\n"); start_cycles = get_cycles(); start_instret = get_instret(); hanoi_with_gray(n, moves); end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; TEST_LOGGER(" Cycles: "); print_dec((unsigned long) cycles_elapsed); TEST_LOGGER(" Instructions: "); print_dec((unsigned long) instret_elapsed); TEST_LOGGER("\n"); return 0; } ``` ## quiz2_hanoi_graycode.S ```s .data .align 2 msg_hdr: .string "Moving 3 disks using Gray code sequence:\n" .align 2 moving_msg: .string "Move disk " .align 2 msgA: .string "A" .align 2 msgB: .string "B" .align 2 msgC: .string "C" .align 2 msg_from: .string " from peg " .align 2 msg_to: .string " to peg " .align 2 newline: .string "\n" .align 2 msg_cycles: .string " Cycles: " .align 2 buffer: .word 32 # temporary 1-byte buffer at buffer .align 2 moves: .word 0,0,0,0,0,0,0,0 # 8 words, initialized to 0 .equ STDOUT, 1 .equ WRITE, 64 .equ EXIT, 93 .text .align 2 .globl _start _start: la sp, __stack_top call main li a7, EXIT li a0, 0 ecall main: # print header li a0, STDOUT la a1, msg_hdr li a2, 41 li a7, WRITE ecall # n = 3 li s1, 3 ##------------cycle------------------------------- call get_cycles mv s6, a0 # low mv s11, a1 # high hanoi: # bound = (1 << n) - 1 li t0, 1 sll s4, t0, s1 # s4 = 1 << n addi s4, s4, -1 # s4 = (1<<n) - 1 li t1, 1 # i = 1 li s2, 0 # previous = 0 hanoi_loop: bgt t1, s4, hanoi_end # current = gray_code_trans(i) mv a0, t1 jal ra, gray_code_trans mv s3, a0 # s3 = current xor t2, s3, s2 # diff = current ^ previous li t3, 0 #disk find_disk: li t4, 1 sll t4, t4, t3 # mask = 1 << t3 and t5, t2, t4 bnez t5, disk_found addi t3, t3, 1 j find_disk disk_found: # t3 = disk index (0=A,1=B,2=C) # load current pos of that disk la t6, moves slli t4, t3, 2 add t6, t6, t4 lw t0, 0(t6) # t0 = from_peg (0..2) mv s5, t0 # save from peg in s5 # compute to_peg (t0 will be overwritten with target) # if disk == 0 -> smallest disk rule beq t3, x0, handle_small # else (larger disk): to_peg = 3 - cur - moves[0]; then normalize to 0..2 la t6, moves lw s7, 0(t6) # t7 = moves[0] (ref) li s8, 3 sub s8, s8, t0 # t8 = 3 - cur sub s8, s8, s7 # t8 = 3 - cur - ref # normalize t8 into 0..2 blt s8, x0, sub_add3_larger li s9, 3 blt s8, s9, store_target_large sub s8, s8, s9 j store_target_large sub_add3_larger: addi s8, s8, 3 store_target_large: mv t0, s8 j do_print_and_store handle_small: # small_shift = (n % 2 == 0) ? 1 : 2 andi t6, s1, 1 # t6 = n & 1 beq t6, x0, small_shift_is_one li t6, 2 j small_shift_done small_shift_is_one: li t6, 1 small_shift_done: add s8, t0, t6 # t8 = moves[0] + small_shift li s9, 3 blt s8, s9, store_target_small sub s8, s8, s9 store_target_small: mv t0, s8 do_print_and_store: # Print "Move disk " li a0, STDOUT la a1, moving_msg li a2, 10 li a7, WRITE ecall # Print disk letter (A/B/C) # choose correct label li t6, 0 beq t3, t6, print_A li t6, 1 beq t3, t6, print_B # default -> C li a0, STDOUT la a1, msgC li a2, 1 li a7, WRITE ecall j print_from_label print_A: li a0, STDOUT la a1, msgA li a2, 1 li a7, WRITE ecall j print_from_label print_B: li a0, STDOUT la a1, msgB li a2, 1 li a7, WRITE ecall print_from_label: # Print " from peg " li a0, STDOUT la a1, msg_from li a2, 10 li a7, WRITE ecall # print from peg number (1-based) as single char li t6, 49 #'1' add t6, t6, s5 # '1' + from (s5) la s7, buffer sb t6, 0(s7) li a0, STDOUT la a1, buffer li a2, 1 li a7, WRITE ecall # Print " to peg " li a0, STDOUT la a1, msg_to li a2, 8 li a7, WRITE ecall # print to peg number (1-based) as single char - t0 holds target li t6, 49 #'1' add t6, t6, t0 la s7, buffer sb t6, 0(s7) li a0, STDOUT la a1, buffer li a2, 1 li a7, WRITE ecall # newline li a0, STDOUT la a1, newline li a2, 1 li a7, WRITE ecall # store moves[disk] = target (t0) la t6, moves slli t4, t3, 2 add t6, t6, t4 sw t0, 0(t6) # prepare next step mv s2, s3 # previous = current addi t1, t1, 1 j hanoi_loop hanoi_end: call get_cycles mv t0, a0 # end low mv t1, a1 # end high # 計算差值 sub s1, t0, s6 sub s6, t1, s11 # 輸出 cycles li a0, STDOUT la a1, msg_cycles li a2, 10 li a7, WRITE ecall mv a0, s1 # low bits jal ra, print_int li a0, STDOUT la a1, newline li a2, 1 li a7, WRITE ecall li a7, EXIT li a0, 0 ecall gray_code_trans: mv t0, a0 srli s10, a0, 1 xor a0, t0, s10 mv t0,x0 ret print_int: addi sp, sp, -20 sw ra, 0(sp) sw t0, 4(sp) sw t1, 8(sp) sw t2, 12(sp) sw t3, 16(sp) la t0, buffer addi t1, t0, 10 sb x0, 0(t1) # '\0' beqz a0, print_zero_I print_int_loop_I: # --- 計算 a0 / 10 與餘數 --- li t3, 0 # q = t3 li t4, 0 # rem = t4 li t2, 10 div_loop_I: blt a0, t2, div_end_I addi a0, a0, -10 addi t3, t3, 1 # q + 1 j div_loop_I div_end_I: mv t4, a0 mv a0, t3 # q = a0 addi t4, t4, 48 # ASCII addi t1, t1, -1 sb t4, 0(t1) bnez a0, print_int_loop_I # --- 輸出字串 --- li a0, 1 # STDOUT mv a1, t1 la t2, buffer addi t2, t2, 10 sub a2, t2, t1 li a7, 64 # WRITE ecall j print_int_end_I print_zero_I: li t2, 48 sb t2, 0(t0) li a0, 1 la a1, buffer li a2, 1 li a7, 64 ecall print_int_end_I: lw ra, 0(sp) lw t0, 4(sp) lw t1, 8(sp) lw t2, 12(sp) lw t3, 16(sp) addi sp, sp, 20 ret ``` ## quiz3_main.c ```c #include <stdbool.h> #include <stdint.h> #include <string.h> #include <stdio.h> #define SCALE (16) #define printstr(ptr, length) \ do { \ asm volatile( \ "add a7, x0, 0x40;" \ "add a0, x0, 0x1;" /* stdout */ \ "add a1, x0, %0;" \ "mv a2, %1;" /* length character */ \ "ecall;" \ : \ : "r"(ptr), "r"(length) \ : "a0", "a1", "a2", "a7"); \ } while (0) #define TEST_OUTPUT(msg, length) printstr(msg, length) #define TEST_LOGGER(msg) \ { \ char _msg[] = msg; \ TEST_OUTPUT(_msg, sizeof(_msg) - 1); \ } extern uint64_t get_cycles(void); extern uint64_t get_instret(void); static unsigned long umod(unsigned long dividend, unsigned long divisor); static unsigned long udiv(unsigned long dividend, unsigned long divisor); static uint64_t mul32(uint32_t a, uint32_t b) { uint64_t result = 0; for (int i = 0; i < 32; i++) { if (b & (1U << i)) { result += (uint64_t)a << i; } } return result; } static const uint16_t rsqrt_table[32] = { 65535U, 46341U, 32768U, 23170U, 16384U, 11585U, 8192U, 5793U, 4096U, 2896U, 2048U, 1448U, 1024U, 724U, 512U, 362U, 256U, 181U, 128U, 90U, 64U, 45U, 32U, 23U, 16U, 11U, 8U, 6U, 4U, 3U, 2U, 1U }; static int clz(uint32_t x){ if (!x) return 32; int n = 0; if ((x & 0xFFFF0000) == 0) { n += 16; x <<= 16; } if ((x & 0xFF000000) == 0) { n += 8; x <<= 8; } if ((x & 0xF0000000) == 0) { n += 4; x <<= 4; } if ((x & 0xC0000000) == 0) { n += 2; x <<= 2; } if ((x & 0x80000000) == 0) { n += 1; } return n; } uint32_t fast_rsqrt(uint32_t x) { if (x == 0) return 0xFFFFFFFF; /* saturate */ if (x == 1) return (1U << SCALE); /* 1.0 in Q16 */ /* exponent = floor(log2(x)) */ int exp = 31 - clz(x); uint32_t y = rsqrt_table[exp]; /* Q16 initial */ /* linear interpolation for better initial guess */ if (x > (1u << exp)) { uint32_t y1 = (exp < 31) ? rsqrt_table[exp + 1] : 0; uint32_t diff = (y > y1) ? (y - y1) : 0; uint32_t frac = (uint32_t)((((uint64_t)x - (1ULL << exp)) << SCALE) >> exp); /* Q16 fraction */ y -= (uint32_t)( (mul32(diff, frac)) >> SCALE ); /* keep Q16 */ } //* Newton iterations (2 times) for (int i = 0; i < 2; i++) { uint64_t y2 = mul32(y, y); /* Q32 */ uint32_t y2_hi = (uint32_t)(y2 >> 16); /* Q16 */ uint64_t xy2 = mul32(x, y2_hi); /* A_Q16 */ uint64_t three_q16 = (uint64_t)3U << SCALE; /* 3 in Q16 */ /* compute B_Q16 = ((3<<16) - xy2) >> 1 (still fits <= 32 bits for valid inputs) */ uint64_t tmp; if (xy2 >= three_q16) tmp = 0; else tmp = (three_q16 - xy2) >> 1; uint32_t B_q16 = (uint32_t)tmp; y = (uint32_t)( mul32(y, B_q16) >> SCALE ); /* Q16 */ } return y; } uint32_t fast_distance_3d(int32_t dx, int32_t dy, int32_t dz) { /* absolute values */ dx = (dx < 0) ? -dx : dx; dy = (dy < 0) ? -dy : dy; dz = (dz < 0) ? -dz : dz; /* sum of squares (use 64-bit) */ uint64_t dist_sq = mul32((uint32_t)dx, (uint32_t)dx) + mul32((uint32_t)dy, (uint32_t)dy) + mul32((uint32_t)dz, (uint32_t)dz); uint32_t scaled_dist_sq; int shift = 0; if (dist_sq > 0xFFFFFFFFULL) { scaled_dist_sq = (uint32_t)(dist_sq >> 16); /* scale down */ shift = 8; /* will multiply result by 2^8 later */ } else { scaled_dist_sq = (uint32_t)dist_sq; } uint32_t inv_sqrt = fast_rsqrt(scaled_dist_sq); /* Q16 */ uint64_t result = mul32(scaled_dist_sq, inv_sqrt) >> SCALE; /* back to Q0 (integer) */ if (shift) result <<= shift; return (result > UINT32_MAX) ? UINT32_MAX : (uint32_t)result; } /* integer sqrt (bitwise) */ static uint32_t my_sqrt(uint32_t x) { if (x == 0) return 0; uint32_t result = 0; uint32_t bit = 1U << 30; while (bit > x) bit >>= 2; while (bit != 0) { if (x >= result + bit) { x -= result + bit; result = (result >> 1) + bit; } else { result >>= 1; } bit >>= 2; } return result; } static int32_t my_abs(int32_t x) { int32_t mask = x >> 31; return (x + mask) ^ mask; } static uint32_t f_abs(uint32_t x, uint32_t y) { return (x >= y) ? (x - y) : (y - x); } /* Simple integer to decimal string conversion */ static void print_dec_no_nl(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf) - 1; *p = '\0'; p--; if (val == 0) { *p = '0'; p--; } else { while (val > 0) { *p = '0' + (char)umod(val, 10); p--; val = udiv(val, 10); } } p++; printstr(p, (size_t)(buf + sizeof(buf) - p - 1)); /* exclude terminating '\0' */ } /* print decimal with fixed width (pad with '0') */ static void print_dec_pad(unsigned long val, int width) { char buf[32]; for (int i = width - 1; i >= 0; --i) { buf[i] = '0' + (char)umod(val, 10); val = udiv(val, 10); } printstr(buf, (size_t)width); } /* keep original print_dec as newline-terminated convenience */ static void print_dec(unsigned long val) { print_dec_no_nl(val); TEST_LOGGER("\n"); } void *memcpy(void *dest, const void *src, size_t n) { uint8_t *d = (uint8_t *) dest; const uint8_t *s = (const uint8_t *) src; while (n--) *d++ = *s++; return dest; } static uint32_t umul(uint32_t a, uint32_t b) { uint32_t result = 0; while (b) { if (b & 1) result += a; a <<= 1; b >>= 1; } return result; } static unsigned long umod(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } /* Software division for RV32I (no M extension) */ static unsigned long udiv(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } /* 64-bit divide by 32-bit (shift-subtract) -> returns 64-bit quotient */ static uint64_t udiv64(uint64_t dividend, uint32_t divisor) { if (divisor == 0) return (uint64_t)-1; uint64_t quotient = 0; uint64_t rem = 0; for (int i = 63; i >= 0; i--) { rem = (rem << 1) | ((dividend >> i) & 1ULL); if (rem >= divisor) { rem -= divisor; quotient |= (1ULL << i); } } return quotient; } int main(void) { uint64_t start_cycles, end_cycles, cycles_elapsed; uint64_t start_instret, end_instret, instret_elapsed; TEST_LOGGER("=== Part 1: Reciprocal Square Root Comparison ===\n\n"); int testcase[3] = {4, 2000, 50000}; for (int i = 0; i < 3; i++) { start_cycles = get_cycles(); start_instret = get_instret(); uint32_t x = (uint32_t)testcase[i]; uint32_t y_fixed = fast_rsqrt(x); /* Q16 */ end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; /* format fast (Q16) into int.frac(4 digits) using umul */ uint32_t y_int = y_fixed >> SCALE; uint32_t mask = (1u << SCALE) - 1; uint64_t yprod = mul32((uint32_t)(y_fixed & mask), 10000); /* 64-bit product */ uint32_t y_frac4 = (uint32_t)(yprod >> SCALE); // compute expected in Q16: uint32_t sqrt_x_q8 = my_sqrt(x << SCALE); uint32_t expected_fixed = (sqrt_x_q8 == 0) ? 0 : (uint32_t)udiv((1U << (SCALE + 8)), sqrt_x_q8); uint64_t eprod = mul32(expected_fixed & mask, 10000); uint32_t exp_int = expected_fixed >> SCALE; uint32_t exp_frac4 = (uint32_t)(eprod >> SCALE); uint32_t diff_fixed = f_abs(expected_fixed, y_fixed); /* error_milli = (diff_fixed * 100000) / expected_fixed, use mul32 + udiv64 */ uint64_t prod = mul32(diff_fixed, 100000U); uint64_t error_milli = (expected_fixed == 0) ? 0 : udiv64(prod, expected_fixed); uint32_t error_int = (uint32_t)(error_milli / 1000ULL); uint32_t error_frac3 = (uint32_t)(error_milli % 1000ULL); TEST_LOGGER("x = "); print_dec_no_nl(x); TEST_LOGGER(" fast = "); print_dec_no_nl(y_int); TEST_LOGGER("."); print_dec_pad(y_frac4, 4); TEST_LOGGER(" expected = "); print_dec_no_nl(exp_int); TEST_LOGGER("."); print_dec_pad(exp_frac4, 4); TEST_LOGGER(" error = "); print_dec_no_nl(error_int); TEST_LOGGER("."); print_dec_pad(error_frac3, 3); TEST_LOGGER("%\n"); TEST_LOGGER(" Cycles: "); print_dec((unsigned long)cycles_elapsed); TEST_LOGGER(" Instructions: "); print_dec((unsigned long)instret_elapsed); TEST_LOGGER("\n"); } TEST_LOGGER("\n=== Part 2: 3D Distance Approximation (scaled correctly) ===\n\n"); int dxs[3] = {3, 10, 100}; int dys[3] = {4, 20, 200}; int dzs[3] = {12, 30, 300}; for (int i = 0; i < 3; i++) { start_cycles = get_cycles(); start_instret = get_instret(); uint32_t fast = fast_distance_3d(dxs[i], dys[i], dzs[i]); end_cycles = get_cycles(); end_instret = get_instret(); cycles_elapsed = end_cycles - start_cycles; instret_elapsed = end_instret - start_instret; /* sum_squares using umul */ uint32_t s0 = umul((uint32_t)dxs[i], (uint32_t)dxs[i]); uint32_t s1 = umul((uint32_t)dys[i], (uint32_t)dys[i]); uint32_t s2 = umul((uint32_t)dzs[i], (uint32_t)dzs[i]); uint32_t sum_squares = s0 + s1 + s2; uint32_t real = my_sqrt(sum_squares); uint32_t diff = (fast > real) ? (fast - real) : (real - fast); /* error = (diff * 100) / real -> use umul + udiv */ uint32_t pct_num = umul(diff, 100U); uint32_t error = (real == 0) ? 0 : (uint32_t)udiv(pct_num, real); TEST_LOGGER("x = "); TEST_LOGGER("("); print_dec_no_nl((unsigned long)dxs[i]); TEST_LOGGER(","); print_dec_no_nl((unsigned long)dys[i]); TEST_LOGGER(","); print_dec_no_nl((unsigned long)dzs[i]); TEST_LOGGER(") fast = "); print_dec_no_nl(fast); TEST_LOGGER(" expected = "); print_dec_no_nl(real); TEST_LOGGER(" error = "); print_dec_no_nl(error); TEST_LOGGER("%\n"); TEST_LOGGER(" Cycles: "); print_dec((unsigned long)cycles_elapsed); TEST_LOGGER(" Instructions: "); print_dec((unsigned long)instret_elapsed); TEST_LOGGER("\n"); } return 0; } ```