# Assignment2: Complete Applications contributed by < [Wei-Chen Lai](https://github.com/Winstonllllai) > > [GitHub](https://github.com/Winstonllllai/ca2025-HW2) ## Set Up Environment On MacOS 1. Install RISC-V GNU Toolchain open terminal and execute instructions ```bash wget https://github.com/xpack-dev-tools/riscv-none-elf-gcc-xpack/releases/download/v15.2.0-1/xpack-riscv-none-elf-gcc-15.2.0-1-darwin-arm64.tar.gz tar zxvf xpack-riscv-none-elf-gcc-15.2.0-1-darwin-arm64.tar.gz cp -af xpack-riscv-none-elf-gcc-15.2.0-1 $HOME/riscv-none-elf-gcc cd $HOME/riscv-none-elf-gcc echo "export PATH=`pwd`/bin:\$PATH" > setenv source $HOME/riscv-none-elf-gcc/setenv ``` validate the installation is successful ```bash riscv-none-embed-gcc -v ``` output: ``` Using built-in specs. COLLECT_GCC=riscv-none-elf-gcc COLLECT_LTO_WRAPPER=/Users/winston/riscv-none-elf-gcc/bin/../libexec/gcc/riscv-none-elf/14.2.0/lto-wrapper Target: riscv-none-elf Configured with: /Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/sources/gcc-14.2.0/configure --prefix=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/application --with-sysroot=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/application/riscv-none-elf --with-native-system-header-dir=/include --infodir=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install/share/info --mandir=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install/share/man --htmldir=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install/share/html --pdfdir=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install/share/pdf --build=aarch64-apple-darwin20.6.0 --host=aarch64-apple-darwin20.6.0 --target=riscv-none-elf --disable-nls --disable-shared --disable-threads --disable-tls --enable-checking=release --enable-languages=c,c++ --with-gmp=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install --with-newlib --with-pkgversion='xPack GNU RISC-V Embedded GCC arm64' --with-gnu-as --with-gnu-ld --with-system-zlib --with-abi=ilp32 --with-arch=rv32imac --enable-multilib Thread model: single Supported LTO compression algorithms: zlib zstd gcc version 14.2.0 (xPack GNU RISC-V Embedded GCC arm64) ``` 2. Install Dependency ```bash brew install sdl2 sdl2_mixer ``` 3. Build rv32emu ```bash= git clone https://github.com/sysprog21/rv32emu cd rv32emu make ``` 4. Make Test test result: ``` Linux image is found. Skipping downloading. Running hello.elf ... [OK] Fetching SHA-1 of prebuilt binaries ... skipped Checking SHA-1 of prebuilt binaries ... [OK] Running puzzle ... [OK] Running fcalc ... [OK] Running pi ... [OK] ``` ## Run Program on Bare Metal ### Print on Terminal In the previous assignment (arch2025-homework1), we used the [Ripes](https://github.com/mortbopet/Ripes) simulator for RV32I instruction simulation. To print any information to the terminal, we made direct ecalls. Here is a code segment from HW1 Problem B as an example: ``` 250: produced value 0xcfff0, re-encoded to 250 ``` The instructions are as follows: ```asm= mv a0, t4 # a0 = fl li a7, 1 # syscall: print integer ecall la a0, str1 # Load address of str1 li a7, 4 # syscall: print string ecall mv a0, t5 # a0 = value li a7, 34 # syscall: print hex integer ecall la a0, str2 # Load address of str2 li a7, 4 # syscall: print string ecall mv a0, t6 # a0 = fl2 li a7, 1 # syscall: print integer ecall la a0, str6 # Load address of str6 li a7, 4 # syscall: print string ecall # syscall: print string .data str1: .string ": produced value " str2: .string ", re-encoded to " str6: .string "\n" ``` However, when doing bare-metal development with [rv32emu](https://github.com/sysprog21/rv32emu), the system call mechanism is different from that of Ripes. Therefore, this functionality was encapsulated into print.c, allowing it to be invoked as a standard function. #### Code ::: spoiler **print.c** (Click to unfold) ```c /* Bare metal memcpy implementation */ #include <stdint.h> #include <stdlib.h> #include <stdbool.h> #include "print.h" void *memcpy(void *dest, const void *src, size_t n) { uint8_t *d = (uint8_t *) dest; const uint8_t *s = (const uint8_t *) src; while (n--) *d++ = *s++; return dest; } /* Software division for RV32I (no M extension) */ unsigned long udiv(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long quotient = 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; quotient |= (1UL << i); } } return quotient; } unsigned long umod(unsigned long dividend, unsigned long divisor) { if (divisor == 0) return 0; unsigned long remainder = 0; for (int i = 31; i >= 0; i--) { remainder <<= 1; remainder |= (dividend >> i) & 1; if (remainder >= divisor) { remainder -= divisor; } } return remainder; } /* Software multiplication for RV32I (no M extension) */ uint32_t umul(uint32_t a, uint32_t b) { uint32_t result = 0; while (b) { if (b & 1) result += a; a <<= 1; b >>= 1; } return result; } /* Provide __mulsi3 for GCC */ uint32_t __mulsi3(uint32_t a, uint32_t b) { return umul(a, b); } /* Simple integer to hex string conversion */ void print_hex(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf) - 1; if (val == 0) { *p = '0'; p--; } else { while (val > 0) { int digit = val & 0xf; *p = (digit < 10) ? ('0' + digit) : ('a' + digit - 10); p--; val >>= 4; } } p++; printstr(p, (buf + sizeof(buf) - p)); } /* Simple integer to 2-digit hex string conversion (zero-padded) */ void print_hex_byte(uint8_t val) { char buf[2]; int digit1 = (val >> 4) & 0xf; int digit2 = val & 0xf; buf[0] = (digit1 < 10) ? ('0' + digit1) : ('a' + digit1 - 10); buf[1] = (digit2 < 10) ? ('0' + digit2) : ('a' + digit2 - 10); printstr(buf, 2); } /* Simple integer to decimal string conversion */ void print_dec(unsigned long val) { char buf[20]; char *p = buf + sizeof(buf) - 1; if (val == 0) { *p = '0'; p--; } else { while (val > 0) { *p = '0' + umod(val, 10); p--; val = udiv(val, 10); } } p++; printstr(p, (buf + sizeof(buf) - p)); } ``` ::: :::spoiler **print.h** (Click to unfold) ```c # ifndef PRINT_H # define PRINT_H #include <stdint.h> #include <stdlib.h> #include <stdbool.h> #define printstr(ptr, length) \ do { \ asm volatile( \ "add a7, x0, 0x40;" \ "add a0, x0, 0x1;" /* stdout */ \ "add a1, x0, %0;" \ "mv a2, %1;" /* length character */ \ "ecall;" \ : \ : "r"(ptr), "r"(length) \ : "a0", "a1", "a2", "a7"); \ } while (0) #define TEST_OUTPUT(msg, length) printstr(msg, length) #define TEST_LOGGER(msg) \ { \ char _msg[] = msg; \ TEST_OUTPUT(_msg, sizeof(_msg) - 1); \ } extern uint64_t get_cycles(void); extern uint64_t get_instret(void); void *memcpy(void *dest, const void *src, size_t n); unsigned long udiv(unsigned long dividend, unsigned long divisor); unsigned long umod(unsigned long dividend, unsigned long divisor); uint32_t umul(uint32_t a, uint32_t b); uint32_t __mulsi3(uint32_t a, uint32_t b); void print_hex(unsigned long val); void print_hex_byte(uint8_t val); void print_dec(unsigned long val); #endif ``` ::: ### Linker Script - Purpose: Draws the "memory map." - It tells the linker where in memory to place the code (.text) and data (.data) (e.g., at 0x10000). - It defines symbols the C environment needs, most importantly, where the top of the stack is (__stack_top). ``` High Address ⬆︎ +-----------------------+ | .stack | <-- Stack Section (4KB) | (...Free Space...) | +-----------------------+ <-- __stack_top (Symbol) | (Free Space...) | | | +-----------------------+ <-- __bss_end (Symbol) | .bss | <-- Uninitialized Globals +-----------------------+ <-- __bss_start (Symbol) | .data | <-- Initialized Data | (str1: "Move Disk ") | | (obdata: 0x3c, 0x3b) | +-----------------------+ | .text | <-- Code Section | (print_dec func...) | | (main func...) | | (_start func...) | 0x10000+-----------------------+ <-- Program Start Address | (Reserved) | 0x00000+-----------------------+ Low Address ⬇︎ ``` ```linker= OUTPUT_ARCH("riscv"); ENTRY(_start); SECTIONS { . = 0x10000; .text : { *(.text._start) *(.text) } .data : { *(.data) } .bss : { __bss_start = .; *(.bss) __bss_end = .; } .stack (NOLOAD) : { . = ALIGN(16); __stack_start = .; . += 4096; __stack_top = .; } } ``` ### Makefile - Purpose: To automate the build process. - It compiles all your .c files and .S files into .o object files. Then, it tells the linker (ld) to use the linker.ld's rules ("the map") to "glue" all the parts (like start.o and your code) together into a single test.elf executable. ```makefile= CROSS_COMPILE = riscv-none-elf- include ../../../mk/toolchain.mk ARCH = -march=rv32izicsr LINKER_SCRIPT = linker.ld EMU ?= ../../../build/rv32emu AFLAGS = -g $(ARCH) CFLAGS = -g -march=rv32i_zicsr LDFLAGS = -T $(LINKER_SCRIPT) EXEC = test.elf CC = $(CROSS_COMPILE)gcc AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld OBJDUMP = $(CROSS_COMPILE)objdump OBJS = start.o uf8_optim.o print.o .PHONY: all run dump clean all: $(EXEC) $(EXEC): $(OBJS) $(LINKER_SCRIPT) $(LD) $(LDFLAGS) -o $@ $(OBJS) %.o: %.S $(AS) $(AFLAGS) $< -o $@ %.o: %.c $(CC) $(CFLAGS) $< -o $@ -c run: $(EXEC) @test -f $(EMU) || (echo "Error: $(EMU) not found" && exit 1) @grep -q "ENABLE_ELF_LOADER=1" ../../../build/.config || (echo "Error: ENABLE_ELF_LOADER=1 not set" && exit 1) @grep -q "ENABLE_SYSTEM=1" ../../../build/.config || (echo "Error: ENABLE_SYSTEM=1 not set" && exit 1) $(EMU) $< dump: $(EXEC) $(OBJDUMP) -Ds $< | less clean: rm -f $(EXEC) $(OBJS)v ``` :::info Defining CROSS_COMPILE = riscv-none-elf- is necessary to override the toolchain.mk script's auto-detection. Without it, the script may incorrectly select a 64-bit toolchain, which conflicts with this 32-bit project (-march=rv32izicsr). Setting this variable before the include forces the build to use the correct 32-bit compiler (CC), assembler (AS), and linker (LD), preventing a 32/64-bit incompatibility error. ::: ### Start up Code - Purpose: Does the "prep work" before main. - Sets the Stack Pointer (sp): This is required for C function calls to work. It runs la sp, __stack_top, using the symbol from the linker script. - Prepares the C environment: Clears the BSS section (zeroing out uninitialized global variables). - Calls main: After setup, it finally executes call main to run your C code. ```asm= .section .text._start .globl _start .type _start, @function _start: la sp, __stack_top la t0, __bss_start la t1, __bss_end clear_bss: bge t0, t1, bss_cleared sw zero, 0(t0) addi t0, t0, 4 j clear_bss bss_cleared: call main li a7, 93 li a0, 0 ecall infinite_loop: j infinite_loop .size _start, .-_start .weak __bss_start .weak __bss_end .weak __stack_top ``` ### Analysis #### Program size To display section size, execute instruction below: ``` riscv-none-elf-size build/hello.elf ``` Expected output: ``` text data bss dec hex filename 80 0 0 80 50 build/hello.elf ``` #### Performance 1. **CPU Execution Time** The total time a program takes to run is a function of the total CPU cycles it consumes and the CPU's clock speed. $$\text{Execution Time} = \frac{\text{Total Cycles}}{\text{Clock Rate}}$$ * **Total Cycles**: The total number of CPU cycles the program took to complete. * **Clock Rate**: How many cycles the CPU can execute per second (e.g., 3 GHz). 2. **Performance Decomposition** The "Total Cycles" from the first formula can be broken down further, giving the most famous and important performance equation: $$\text{Execution Time} = \frac{\text{Instructions}}{\text{Program}} \times \frac{\text{Cycles}}{\text{Instruction}} \times \frac{\text{Time}}{\text{Cycle}}$$ We can simplify these three terms into: $$\text{Execution Time} = \text{Instruction Count} \times \text{CPI} \times \text{Clock Cycle Time}$$ * **Instruction Count (IC)**: The total number of instructions executed by the program. * **CPI (Cycles Per Instruction)**: How many CPU cycles it takes to execute **one instruction on average**. This is a key metric for CPU architecture efficiency. * **Clock Cycle Time**: The time required for one CPU cycle (e.g., for a 3 GHz Clock Rate, the Clock Cycle Time is 1/3G seconds). 3. **IPC (Instructions Per Cycle)** IPC is the inverse of CPI. It measures how many instructions are executed in **one CPU cycle on average**. $$\text{IPC} = \frac{1}{\text{CPI}}$$ * **IPC > 1**: Represents a "Superscalar" architecture that can complete more than one instruction per cycle on average. * **IPC < 1**: Represents an architecture where one instruction takes more than one cycle on average. **```getcycles.S```**: - Provides the get_cycles function. - Used to read the 64-bit CPU cycle counter (cycleh and cycle). - It reads the high bits, then the low bits, then the high bits again. If the two high-bit reads do not match, it retries. ```asm= .text .globl get_cycles .align 2 get_cycles: csrr a1, cycleh csrr a0, cycle csrr a2, cycleh bne a1, a2, get_cycles ret .size get_cycles,.-get_cycles ``` **```getinstret.S```**: - Provides the get_instret function. - Used to read the 64-bit instruction-retired counter (instreth and instret). - It uses the exact same "high-low-high" retry logic as getcycles.S. ```asm= .text .globl get_instret .align 2 get_instret: csrr a1, instreth csrr a0, instret csrr a2, instreth bne a1, a2, get_instret ret .size get_instret,.-get_instret ``` :::info By default, rv32emu is a "Functional Simulator," meaning its goal is correctness (ensuring every instruction executes correctly) rather than "Timing Simulation" (simulating the precise time each instruction takes). As we can see, in ```src/emulate.c```, both ```CSR_CYCLE``` and ```CSR_INSTRET``` points to ```rv->csr_cycle```. So ==**CPI will always be 1**==. ```c // rv32emu/src/emulate.c case CSR_CYCLE: /* Cycle counter for RDCYCLE instruction */ return (uint32_t *) &rv->csr_cycle; case CSR_CYCLEH: /* Upper 32 bits of cycle */ return &((uint32_t *) &rv->csr_cycle)[1]; ... case CSR_INSTRET: /* Number of Instructions Retired Counter */ return (uint32_t *) (&rv->csr_cycle); case CSR_INSTRETH: /* Upper 32 bits of instructions retired */ return &((uint32_t *) &rv->csr_cycle)[1]; ``` ::: ## HW1 Problem B The code is adapted from the previous assignment [(arch2025-homework1-Problem B)](https://hackmd.io/0mzMhln2To2AGMOu9rz3Pg?both#Problem-B). ### uf8 `uf8` is a specialized 8-bit unsigned numerical representation designed for data compression. This scheme enables the storage of a dynamic range far exceeding that of a standard `uint8_t` (0-255) within a single byte (8 bits). Its core principle involves a non-linear quantization strategy that allocates the limited bits between an exponent and a mantissa, achieving a balance between numerical dynamic range and resolution precision. * **Bit Layout** ``` ┌──────────────┬──────────────┐ │ Exponent (4) │ Mantissa (4) │ └──────────────┴──────────────┘ E: Exponent bits (4 bits) M: Mantissa/fraction bits (4 bits) ``` * **Decoding** $D(b) = m \cdot 2^e + (2^e - 1) \cdot 16$ where $e = \lfloor b/16 \rfloor$ and $m = b \bmod 16$ * **Encoding** $E(v) = \begin{cases} v, & \text{if } v < 16 \\ 16e + \lfloor(v - \text{offset}(e))/2^e\rfloor, & \text{otherwise} \end{cases}$ Where $\text{offset}(e) = (2^e - 1) \cdot 16$ ### `clz` optimization It is a processor instruction that counts the number of consecutive zero bits in a binary number, starting from the most significant bit (the left side) until the first 1 is found. Its main purpose is to quickly determine a number's magnitude or to normalize it for floating-point operations. The clz function was optimized using loop unrolling. The original iterative loop was replaced with a linear sequence of instructions that explicitly performs each step. This improves performance by eliminating loop control overhead and branch instructions, at the cost of a slight increase in code size. * **Original** ```assembly= clz: # Input: a0 = 32-bit unsigned integer. # Output: a0 = number of leading zeros in x's binary representation li t0, 32 # n = t0 = 32 li t1, 16 # c = t1 = 16 clz.loop: srl t2, a0, t1 # y = t2 = x >> c beq t2, zero, clz.skip # if (y == 0) goto clz.skip sub t0, t0, t1 # n -= c mv a0, t2 # x = y clz.skip: srli t1, t1, 1 bne t1, zero, clz.loop # while (c != 0) goto clz.loop sub a0, t0, a0 # return n - x ret # End of clz function ``` * **Unroll loop** ```assembly= clz: # Input: a0 = 32-bit unsigned integer. # Output: a0 = number of leading zeros in x's binary representation li t0, 32 # n = t0 = 32 srli t2, a0, 16 # y = t2 = x >> 16 beq t2, zero, clz.L_c8 # if (y == 0) goto clz.L_c8 addi t0, t0, -16 # n -= 16 mv a0, t2 # x = y clz.L_c8: srli t2, a0, 8 # y = t2 = x >> 8 beq t2, zero, clz.L_c4 # if (y == 0) goto clz.L_c4 addi t0, t0, -8 # n -= 8 mv a0, t2 # x = y clz.L_c4: srli t2, a0, 4 # y = t2 = x >> 4 beq t2, zero, clz.L_c2 # if (y == 0) goto clz.L_c2 addi t0, t0, -4 # n -= 4 mv a0, t2 # x = y clz.L_c2: srli t2, a0, 2 # y = t2 = x >> 2 beq t2, zero, clz.L_c1 # if (y == 0) goto .L_c1 addi t0, t0, -2 # n -= 2 mv a0, t2 # x = y clz.L_c1: srli t2, a0, 1 # y = t2 = x >> 1 beq t2, zero, clz.L_final # if (y == 0) goto clz.L_final addi t0, t0, -1 # n -= 1 mv a0, t2 # x = y clz.L_final: sub a0, t0, a0 # return n - x ret ``` ### Code :::spoiler **Modification** (Click to unfold) ```diff= @@ -13,6 +13,7 @@ .text +.globl main # ====================================== # Function: main # ====================================== @@ -25,13 +26,14 @@ lw ra, 0(sp) # Restore return address addi sp, sp, 4 # Deallocate stack space beq a0, zero, main.end # if (a0 == 0) goto main.end - la a0, str5 # Load address of str5 - li a7, 4 # syscall: print string + li a7, 0x40 # syscall: write + li a0, 1 # stdout + la a1, str5 # Load address of str5 + li a2, 18 # length = 18 ecall main.end: - li a7, 10 # exit code = 10 - ecall + ret # ====================================== # Function: clz @@ -137,70 +139,63 @@ test: # Input: void # Output: a0 = boolean (1 = pass, 0 = fail) - li t0, -1 # previous_value = -1 - li t1, 1 # t1 = passed = 1 - li t2, 0 # i = 0 - li t3, 256 # max = 256 + addi sp, sp, -20 # Allocate stack space + sw ra, 0(sp) # Save return address + sw s0, 4(sp) # Save previous_value + sw s1, 8(sp) # Save passed + sw s2, 12(sp) # Save i + sw s3, 16(sp) # Save max + li s0, -1 # previous_value = -1 + li s1, 1 # s1 = passed = 1 + li s2, 0 # s2 = i = 0 + li s3, 256 # s3 = max = 256 test.loop: - bge t2, t3, test.end # if (i >= max) goto end - mv t4, t2 # fl = t4 = i - mv a0, t4 # a0 = fl - addi sp, sp, -24 # Allocate stack space - sw ra, 0(sp) # Save return address - sw t0, 4(sp) # Save previous_value - sw t1, 8(sp) # Save passed - sw t2, 12(sp) # Save i - sw t3, 16(sp) # Save max - sw t4, 20(sp) # Save fl + bge s2, s3, test.end # if (i >= max) goto end + mv a0, s2 # a0 = fl jal uf8_decode # a0 = uf8_decode(fl) mv t5, a0 # value = t5 = uf8_decode(fl) jal uf8_encode # a0 = uf8_encode(value) mv t6, a0 # fl2 = t6 = uf8_encode(value) - lw t4, 20(sp) # Restore fl - lw t3, 16(sp) # Restore max - lw t2, 12(sp) # Restore i - lw t1, 8(sp) # Restore passed - lw t0, 4(sp) # Restore previous_value - lw ra, 0(sp) # Restore return address - addi sp, sp, 24 # Deallocate stack space + mv t4, s2 # fl = t4 = i beq t4, t6, test.skip1 # if (fl == fl2) goto skip1 mv a0, t4 # a0 = fl - li a7, 34 # syscall: print integer - ecall - la a0, str1 # Load address of str1 - li a7, 4 # syscall: print string + jal print_dec # print fl + li a7, 0x40 # syscall: write + li a0, 1 # stdout + la a1, str1 # Load address of str1 + li a2, 17 # length = 17 ecall mv a0, t5 # a0 = value - li a7, 1 # syscall: print integer - ecall - la a0, str2 # Load address of str2 - li a7, 4 # syscall: print string + jal print_dec # print value + li a7, 0x40 # syscall: write + li a0, 1 # stdout + la a1, str2 # Load address of str2 + li a2, 21 # length = 21 ecall mv a0, t6 # a0 = fl2 - li a7, 34 # syscall: print integer - ecall - la a0, str6 # Load address of str6 - li a7, 4 # syscall: print string + jal print_dec # print fl2 + li a7, 0x40 # syscall: write + li a0, 1 # stdout + la a1, str6 # Load address of str6 + li a2, 1 # length = 1 ecall - li t1, 0 # passed = 0 + li s1, 0 # passed = 0 test.skip1: - blt t0, t5, test.skip2 # if (previous_value < value) goto skip2 + blt s0, t5, test.skip2 # if (previous_value < value) goto skip2 mv a0, t4 # a0 = fl - li a7, 34 # syscall: print integer - ecall - la a0, str3 # Load address of str3 - li a7, 4 # syscall: print string + jal print_dec # print fl + li a7, 0x40 # syscall: write + li a0, 1 # stdout + la a1, str3 # Load address of str3 + li a2, 8 # length = 8 ecall mv a0, t5 # a0 = value - li a7, 1 # syscall: print integer - ecall - la a0, str4 # Load address of str4 - li a7, 4 # syscall: print string + jal print_dec # print value + li a7, 0x40 # syscall: write + li a0, 1 # stdout + la a1, str4 # Load address of str4 + li a2, 19 # length = 19 ecall - mv a0, t0 # a0 = previous_value - li a7, 1 # syscall: print integer - ecall - la a0, str6 # Load address of str6 - li a7, 4 # syscall: print string + mv a0, s0 # a0 = previous_value + jal print_dec # print previous_value + li a7, 0x40 # syscall: write + li a0, 1 # stdout + la a1, str6 # Load address of str6 + li a2, 1 # length = 1 ecall - li t1, 0 # passed = 0 + li s1, 0 # passed = 0 + mv a0, s1 # return passed test.skip2: - mv t0, t5 # previous_value = value - addi t2, t2, 1 # i++ + mv s0, t5 # previous_value = value + addi s2, s2, 1 # i++ j test.loop test.end: - mv a0, t1 # return passed + lw s3, 16(sp) # Restore max + lw s2, 12(sp) # Restore i + lw s1, 8(sp) # Restore passed + lw s0, 4(sp) # Restore previous_value + lw ra, 0(sp) # Restore return address + addi sp, sp, 20 # Deallocate stack space ret # End of test function ``` ::: #### Output ``` 18:04:40 INFO src/riscv.c:552: Log level: TRACE 18:04:40 INFO src/riscv.c:565: test.elf ELF loaded 18:04:40 INFO src/main.c:315: RISC-V emulator is created and ready to run All tests passed. 18:04:40 INFO src/main.c:338: RISC-V emulator is destroyed ``` ### Analysis #### Program size | Section | C code | Original Assembly | Optimized Assembly | | ------- | ------ | ----------------- | ------------------ | | text | 3300 | 2204 | 2256 | | data | 0 | 122 | 122 | | bss | 4108 | 4106 | 4102 | | total | 7408 | 6432 | 6480 | #### Performance Clock cycles / Instruction count: | C code | Original Assembly | Optimized Assembly | | ------ | ----------------- | ------------------ | | 71326 | 30290 | 27650 | ## Quiz2 Problem A ### Hanoi Tower The Tower of Hanoi is a famous mathematical game and a classic example of recursive algorithms. It consists of three rods and several disks of different sizes. The objective of the game is to move all the disks from one rod to another, following these rules: only one disk can be moved at a time, and no larger disk may be placed on top of a smaller one. ![image](https://hackmd.io/_uploads/ryY6KVcy-e.png) ### Algorithm: Gray Code - The core of the program is non-recursive, utilizing an iterative solution to the 3-disk Tower of Hanoi problem. This design is critical for assembly implementation as it eliminates the overhead of recursive function calls and prevents potential stack overflow. The program runs a game_loop from 1 to 7. In each iteration, it uses a Gray Code algorithm to determine which disk to move. By relying on Gray Codes, complex recursive logic transforms into fast bitwise operations: 1. Calculate Current Gray(n): xor x6, x8, x5 (where n is in x8). 2. Calculate Previous Gray(n-1): xor x7, x7, x28. 3. Find Changed Bit: xor x5, x6, x7. The single set bit (value 1) in x5 indicates which disk number is to be moved. 4. Determine Disk: The code then uses andi and bne to check the 0th, 1st, and 2nd bits of x5 to set x9 to the correct disk number (0, 1, or 2). - Calculating the Target Peg - Smallest Disk (Disk 0): Always moves according to the pattern (current_position + 2) % 3. - Larger Disks (Disk 1 or 2): Use the "sum of pegs" technique (0+1+2=3). The target peg B is calculated as 3 - source_peg A - other_disk_peg C. ### Code #### RV32I Assembly :::spoiler **Assembly code** (Click to unfold) ```asm= .text .globl main main: addi x2, x2, -36 sw x8, 0(x2) sw x9, 4(x2) sw x18, 8(x2) sw x19, 12(x2) sw x20, 16(x2) li x5, 0x15 sw x5, 20(x2) sw x5, 24(x2) sw x5, 28(x2) sw ra, 32(x2) # Fix disk positions (BLANK 1-3: neutralize x5 effect) # BLANK 1: Fix position at x2+20 sw x0, 20(x2) # BLANK 2: Fix position at x2+24 sw x0, 24(x2) # BLANK 3: Fix position at x2+28 sw x0, 28(x2) addi x8, x0, 1 game_loop: # BLANK 4: Check loop termination (2^3 moves) addi x5, x0, 8 beq x8, x5, finish_game # Gray code formula: gray(n) = n XOR (n >> k) # BLANK 5: What is k for Gray code? srli x5, x8, 1 # BLANK 6: Complete Gray(n) calculation xor x6, x8, x5 # BLANK 7-8: Calculate previous value and its shift addi x7, x8, -1 srli x28, x7, 1 # BLANK 9: Generate Gray(n-1) xor x7, x7, x28 # BLANK 10: Which bits changed? xor x5, x6, x7 # Initialize disk number addi x9, x0, 0 # BLANK 11: Mask for testing LSB andi x6, x5, 1 # BLANK 12: Branch if disk 0 moves bne x6, x0, disk_found # BLANK 13: Set disk 1 addi x9, x0, 1 # BLANK 14: Test second bit with proper mask andi x6, x5, 2 bne x6, x0, disk_found # BLANK 15: Last disk number addi x9, x0, 2 disk_found: # BLANK 16: Check impossible pattern (multiple bits) andi x30, x5, 5 addi x31, x0, 5 beq x30, x31, pattern_match jal x0, continue_move pattern_match: continue_move: # BLANK 17: Word-align disk index (multiply by what?) slli x5, x9, 2 # BLANK 18: Base offset for disk array addi x5, x5, 20 add x5, x2, x5 lw x18, 0(x5) bne x9, x0, handle_large # BLANK 19: Small disk moves by how many positions? addi x19, x18, 2 # BLANK 20: Number of pegs addi x6, x0, 3 blt x19, x6, display_move sub x19, x19, x6 jal x0, display_move handle_large: # BLANK 21: Load reference disk position lw x6, 20(x2) # BLANK 22: Sum of all peg indices (0+1+2) addi x19, x0, 3 sub x19, x19, x18 sub x19, x19, x6 display_move: la x20, obdata add x5, x20, x18 lbu x11, 0(x5) li x6, 0x6F xor x11, x11, x6 addi x11, x11, -0x12 add x7, x20, x19 lbu x12, 0(x7) xor x12, x12, x6 addi x12, x12, -0x12 mv t1, x11 mv t2, x12 li a7, 0x40 li a0, 1 la a1, str1 li a2, 10 ecall addi x10, x9, 1 jal print_dec li a0, 1 la a1, str2 li a2, 6 ecall la t0, char_buffer sb t1, 0(t0) li a0, 1 mv a1, t0 li a2, 1 ecall li a0, 1 la a1, str3 li a2, 4 ecall la t0, char_buffer sb t2, 0(t0) li a0, 1 mv a1, t0 li a2, 1 ecall li a0, 1 la a1, str4 li a2, 1 ecall # BLANK 26: Calculate storage offset slli x5, x9, 2 addi x5, x5, 20 add x5, x2, x5 # BLANK 27: Update disk position sw x19, 0(x5) # BLANK 28-29: Increment counter and loop addi x8, x8, 1 jal x0, game_loop finish_game: lw x8, 0(x2) lw x9, 4(x2) lw x18, 8(x2) lw x19, 12(x2) lw x20, 16(x2) lw ra, 32(x2) addi x2, x2, 36 ret .data obdata: .byte 0x3c, 0x3b, 0x3a str1: .asciz "Move Disk " str2: .asciz " from " str3: .asciz " to " str4: .asciz "\n" char_buffer: .space 1 ``` ::: ### Optimization ```Problem_A.S (Original)``` contains a game_loop: structure that iterates 7 times (for counter x8 from 1 to 7). It uses a beq to check for termination (x8 == 8) and a jal to jump back to the start. ```Problem_A_optim.S (Optimized)``` completely removes this loop. It implements loop unrolling by pasting the entire loop body 7 times in a row, allowing the code to execute linearly without any branching or jump instructions. :::spoiler **Modification** (Click to unfold) ```diff= --- a/Problem_A.S +++ b/Problem_A_optim.S @@ -20,31 +20,13 @@ sw ra, 32(x2) # Fix disk positions (BLANK 1-3: neutralize x5 effect) - # BLANK 1: Fix position at x2+20 sw x0, 20(x2) - # BLANK 2: Fix position at x2+24 sw x0, 24(x2) - # BLANK 3: Fix position at x2+28 sw x0, 28(x2) addi x8, x0, 1 -game_loop: - # BLANK 4: Check loop termination (2^3 moves) - addi x5, x0, 8 - beq x8, x5, finish_game - - # Gray code formula: gray(n) = n XOR (n >> k) - # BLANK 5: What is k for Gray code? srli x5, x8, 1 - # BLANK 6: Complete Gray(n) calculation xor x6, x8, x5 - # BLANK 7-8: Calculate previous value and its shift addi x7, x8, -1 srli x28, x7, 1 - # BLANK 9: Generate Gray(n-1) xor x7, x7, x28 - # BLANK 10: Which bits changed? xor x5, x6, x7 # Initialize disk number addi x9, x0, 0 - # BLANK 11: Mask for testing LSB andi x6, x5, 1 - # BLANK 12: Branch if disk 0 moves - bne x6, x0, disk_found - - # BLANK 13: Set disk 1 + bne x6, x0, disk_found_1 + addi x9, x0, 1 - # BLANK 14: Test second bit with proper mask andi x6, x5, 2 - bne x6, x0, disk_found - - # BLANK 15: Last disk number + bne x6, x0, disk_found_1 + addi x9, x0, 2 -disk_found: - # BLANK 16: Check impossible pattern (multiple bits) +disk_found_1: andi x30, x5, 5 addi x31, x0, 5 - beq x30, x31, pattern_match - jal x0, continue_move -pattern_match: -continue_move: - - # BLANK 17: Word-align disk index (multiply by what?) + beq x30, x31, pattern_match_1 + jal x0, continue_move_1 +pattern_match_1: +continue_move_1: + slli x5, x9, 2 - # BLANK 18: Base offset for disk array - addi x5, x5, 20 - add x5, x2, x5 - lw x18, 0(x5) - - bne x9, x0, handle_large - - # BLANK 19: Small disk moves by how many positions? - addi x19, x18, 2 - - # BLANK 20: Number of pegs - addi x6, x0, 3 - blt x19, x6, display_move + addi x5, x5, 20 + add x5, x2, x5 + lw x18, 0(x5) + + bne x9, x0, handle_large_1 + + addi x19, x18, 2 + + addi x6, x0, 3 + blt x19, x6, display_move_1 sub x19, x19, x6 - jal x0, display_move - -handle_large: - # BLANK 21: Load reference disk position + jal x0, display_move_1 + +handle_large_1: lw x6, 20(x2) - # BLANK 22: Sum of all peg indices (0+1+2) addi x19, x0, 3 sub x19, x19, x18 sub x19, x19, x6 -display_move: +display_move_1: la x20, obdata add x5, x20, x18 lbu x11, 0(x5) @@ -141,20 +123,803 @@ li a2, 1 ecall - # BLANK 26: Calculate storage offset - slli x5, x9, 2 - addi x5, x5, 20 - add x5, x2, x5 - - # BLANK 27: Update disk position + slli x5, x9, 2 + addi x5, x5, 20 + add x5, x2, x5 + sw x19, 0(x5) - # BLANK 28-29: Increment counter and loop addi x8, x8, 1 - jal x0, game_loop - -finish_game: + + srli x5, x8, 1 + + xor x6, x8, x5 + + addi x7, x8, -1 + srli x28, x7, 1 + + xor x7, x7, x28 + + xor x5, x6, x7 + + addi x9, x0, 0 + + andi x6, x5, 1 + + bne x6, x0, disk_found_2 + + addi x9, x0, 1 + + andi x6, x5, 2 + bne x6, x0, disk_found_2 + + addi x9, x0, 2 + +disk_found_2: + andi x30, x5, 5 + addi x31, x0, 5 + beq x30, x31, pattern_match_2 + jal x0, continue_move_2 +pattern_match_2: +continue_move_2: + + slli x5, x9, 2 + + addi x5, x5, 20 + add x5, x2, x5 + lw x18, 0(x5) + + bne x9, x0, handle_large_2 + + addi x19, x18, 2 + + addi x6, x0, 3 + blt x19, x6, display_move_2 + sub x19, x19, x6 + jal x0, display_move_2 + +handle_large_2: + lw x6, 20(x2) + + addi x19, x0, 3 + sub x19, x19, x18 + sub x19, x19, x6 + +display_move_2: + la x20, obdata + add x5, x20, x18 + lbu x11, 0(x5) + li x6, 0x6F + xor x11, x11, x6 + addi x11, x11, -0x12 + + add x7, x20, x19 + lbu x12, 0(x7) + xor x12, x12, x6 + addi x12, x12, -0x12 + + mv t1, x11 + mv t2, x12 + li a7, 0x40 + li a0, 1 + la a1, str1 + li a2, 10 + ecall + + addi x10, x9, 1 + jal print_dec + + li a0, 1 + la a1, str2 + li a2, 6 + ecall + + la t0, char_buffer + sb t1, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str3 + li a2, 4 + ecall + + la t0, char_buffer + sb t2, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str4 + li a2, 1 + ecall + + slli x5, x9, 2 + addi x5, x5, 20 + add x5, x2, x5 + + sw x19, 0(x5) + + addi x8, x8, 1 + + srli x5, x8, 1 + + xor x6, x8, x5 + + addi x7, x8, -1 + srli x28, x7, 1 + + xor x7, x7, x28 + + xor x5, x6, x7 + + addi x9, x0, 0 + + andi x6, x5, 1 + + bne x6, x0, disk_found_3 + + addi x9, x0, 1 + + andi x6, x5, 2 + bne x6, x0, disk_found_3 + + addi x9, x0, 2 + +disk_found_3: + andi x30, x5, 5 + addi x31, x0, 5 + beq x30, x31, pattern_match_3 + jal x0, continue_move_3 +pattern_match_3: +continue_move_3: + + slli x5, x9, 2 + + addi x5, x5, 20 + add x5, x2, x5 + lw x18, 0(x5) + + bne x9, x0, handle_large_3 + + addi x19, x18, 2 + + addi x6, x0, 3 + blt x19, x6, display_move_3 + sub x19, x19, x6 + jal x0, display_move_3 + +handle_large_3: + lw x6, 20(x2) + + addi x19, x0, 3 + sub x19, x19, x18 + sub x19, x19, x6 + +display_move_3: + la x20, obdata + add x5, x20, x18 + lbu x11, 0(x5) + li x6, 0x6F + xor x11, x11, x6 + addi x11, x11, -0x12 + + add x7, x20, x19 + lbu x12, 0(x7) + xor x12, x12, x6 + addi x12, x12, -0x12 + + mv t1, x11 + mv t2, x12 + li a7, 0x40 + li a0, 1 + la a1, str1 + li a2, 10 + ecall + + addi x10, x9, 1 + jal print_dec + + li a0, 1 + la a1, str2 + li a2, 6 + ecall + + la t0, char_buffer + sb t1, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str3 + li a2, 4 + ecall + + la t0, char_buffer + sb t2, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str4 + li a2, 1 + ecall + + slli x5, x9, 2 + addi x5, x5, 20 + add x5, x2, x5 + + sw x19, 0(x5) + + addi x8, x8, 1 + + srli x5, x8, 1 + + xor x6, x8, x5 + + addi x7, x8, -1 + srli x28, x7, 1 + + xor x7, x7, x28 + + xor x5, x6, x7 + + addi x9, x0, 0 + + andi x6, x5, 1 + + bne x6, x0, disk_found_4 + + addi x9, x0, 1 + + andi x6, x5, 2 + bne x6, x0, disk_found_4 + + addi x9, x0, 2 + +disk_found_4: + andi x30, x5, 5 + addi x31, x0, 5 + beq x30, x31, pattern_match_4 + jal x0, continue_move_4 +pattern_match_4: +continue_move_4: + + slli x5, x9, 2 + + addi x5, x5, 20 + add x5, x2, x5 + lw x18, 0(x5) + + bne x9, x0, handle_large_4 + + addi x19, x18, 2 + + addi x6, x0, 3 + blt x19, x6, display_move_4 + sub x19, x19, x6 + jal x0, display_move_4 + +handle_large_4: + lw x6, 20(x2) + + addi x19, x0, 3 + sub x19, x19, x18 + sub x19, x19, x6 + +display_move_4: + la x20, obdata + add x5, x20, x18 + lbu x11, 0(x5) + li x6, 0x6F + xor x11, x11, x6 + addi x11, x11, -0x12 + + add x7, x20, x19 + lbu x12, 0(x7) + xor x12, x12, x6 + addi x12, x12, -0x12 + + mv t1, x11 + mv t2, x12 + li a7, 0x40 + li a0, 1 + la a1, str1 + li a2, 10 + ecall + + addi x10, x9, 1 + jal print_dec + + li a0, 1 + la a1, str2 + li a2, 6 + ecall + + la t0, char_buffer + sb t1, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str3 + li a2, 4 + ecall + + la t0, char_buffer + sb t2, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str4 + li a2, 1 + ecall + + slli x5, x9, 2 + addi x5, x5, 20 + add x5, x2, x5 + + sw x19, 0(x5) + + addi x8, x8, 1 + + srli x5, x8, 1 + + xor x6, x8, x5 + + addi x7, x8, -1 + srli x28, x7, 1 + + xor x7, x7, x28 + + xor x5, x6, x7 + + addi x9, x0, 0 + + andi x6, x5, 1 + + bne x6, x0, disk_found_5 + + addi x9, x0, 1 + + andi x6, x5, 2 + bne x6, x0, disk_found_5 + + addi x9, x0, 2 + +disk_found_5: + andi x30, x5, 5 + addi x31, x0, 5 + beq x30, x31, pattern_match_5 + jal x0, continue_move_5 +pattern_match_5: +continue_move_5: + + slli x5, x9, 2 + + addi x5, x5, 20 + add x5, x2, x5 + lw x18, 0(x5) + + bne x9, x0, handle_large_5 + + addi x19, x18, 2 + + addi x6, x0, 3 + blt x19, x6, display_move_5 + sub x19, x19, x6 + jal x0, display_move_5 + +handle_large_5: + lw x6, 20(x2) + + addi x19, x0, 3 + sub x19, x19, x18 + sub x19, x19, x6 + +display_move_5: + la x20, obdata + add x5, x20, x18 + lbu x11, 0(x5) + li x6, 0x6F + xor x11, x11, x6 + addi x11, x11, -0x12 + + add x7, x20, x19 + lbu x12, 0(x7) + xor x12, x12, x6 + addi x12, x12, -0x12 + + mv t1, x11 + mv t2, x12 + li a7, 0x40 + li a0, 1 + la a1, str1 + li a2, 10 + ecall + + addi x10, x9, 1 + jal print_dec + + li a0, 1 + la a1, str2 + li a2, 6 + ecall + + la t0, char_buffer + sb t1, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str3 + li a2, 4 + ecall + + la t0, char_buffer + sb t2, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str4 + li a2, 1 + ecall + + slli x5, x9, 2 + addi x5, x5, 20 + add x5, x2, x5 + + sw x19, 0(x5) + + addi x8, x8, 1 + + srli x5, x8, 1 + + xor x6, x8, x5 + + addi x7, x8, -1 + srli x28, x7, 1 + + xor x7, x7, x28 + + xor x5, x6, x7 + + addi x9, x0, 0 + + andi x6, x5, 1 + + bne x6, x0, disk_found_6 + + addi x9, x0, 1 + + andi x6, x5, 2 + bne x6, x0, disk_found_6 + + addi x9, x0, 2 + +disk_found_6: + andi x30, x5, 5 + addi x31, x0, 5 + beq x30, x31, pattern_match_6 + jal x0, continue_move_6 +pattern_match_6: +continue_move_6: + + slli x5, x9, 2 + + addi x5, x5, 20 + add x5, x2, x5 + lw x18, 0(x5) + + bne x9, x0, handle_large_6 + + addi x19, x18, 2 + + addi x6, x0, 3 + blt x19, x6, display_move_6 + sub x19, x19, x6 + jal x0, display_move_6 + +handle_large_6: + lw x6, 20(x2) + + addi x19, x0, 3 + sub x19, x19, x18 + sub x19, x19, x6 + +display_move_6: + la x20, obdata + add x5, x20, x18 + lbu x11, 0(x5) + li x6, 0x6F + xor x11, x11, x6 + addi x11, x11, -0x12 + + add x7, x20, x19 + lbu x12, 0(x7) + xor x12, x12, x6 + addi x12, x12, -0x12 + + mv t1, x11 + mv t2, x12 + li a7, 0x40 + li a0, 1 + la a1, str1 + li a2, 10 + ecall + + addi x10, x9, 1 + jal print_dec + + li a0, 1 + la a1, str2 + li a2, 6 + ecall + + la t0, char_buffer + sb t1, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str3 + li a2, 4 + ecall + + la t0, char_buffer + sb t2, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str4 + li a2, 1 + ecall + + slli x5, x9, 2 + addi x5, x5, 20 + add x5, x2, x5 + + sw x19, 0(x5) + + addi x8, x8, 1 + + srli x5, x8, 1 + + xor x6, x8, x5 + + addi x7, x8, -1 + srli x28, x7, 1 + + xor x7, x7, x28 + + xor x5, x6, x7 + + addi x9, x0, 0 + + andi x6, x5, 1 + + bne x6, x0, disk_found_7 + + addi x9, x0, 1 + + andi x6, x5, 2 + bne x6, x0, disk_found_7 + + addi x9, x0, 2 + +disk_found_7: + andi x30, x5, 5 + addi x31, x0, 5 + beq x30, x31, pattern_match_7 + jal x0, continue_move_7 +pattern_match_7: +continue_move_7: + + slli x5, x9, 2 + + addi x5, x5, 20 + add x5, x2, x5 + lw x18, 0(x5) + + bne x9, x0, handle_large_7 + + addi x19, x18, 2 + + addi x6, x0, 3 + blt x19, x6, display_move_7 + sub x19, x19, x6 + jal x0, display_move_7 + +handle_large_7: + lw x6, 20(x2) + + addi x19, x0, 3 + sub x19, x19, x18 + sub x19, x19, x6 + +display_move_7: + la x20, obdata + add x5, x20, x18 + lbu x11, 0(x5) + li x6, 0x6F + xor x11, x11, x6 + addi x11, x11, -0x12 + + add x7, x20, x19 + lbu x12, 0(x7) + xor x12, x12, x6 + addi x12, x12, -0x12 + + mv t1, x11 + mv t2, x12 + li a7, 0x40 + li a0, 1 + la a1, str1 + li a2, 10 + ecall + + addi x10, x9, 1 + jal print_dec + + li a0, 1 + la a1, str2 + li a2, 6 + ecall + + la t0, char_buffer + sb t1, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str3 + li a2, 4 + ecall + + la t0, char_buffer + sb t2, 0(t0) + li a0, 1 + mv a1, t0 + li a2, 1 + ecall + + li a0, 1 + la a1, str4 + li a2, 1 + ecall + + slli x5, x9, 2 + addi x5, x5, 20 + add x5, x2, x5 + + sw x19, 0(x5) + + addi x8, x8, 1 + +finish_game: lw x8, 0(x2) lw x9, 4(x2) lw x18, 8(x2) ::: #### Output: ``` 18:10:31 INFO src/riscv.c:552: Log level: TRACE 18:10:31 INFO src/riscv.c:565: test.elf ELF loaded 18:10:31 INFO src/main.c:315: RISC-V emulator is created and ready to run Move Disk 1 from A to C Move Disk 2 from A to B Move Disk 1 from C to B Move Disk 3 from A to C Move Disk 1 from B to A Move Disk 2 from B to C Move Disk 1 from A to C 18:10:31 INFO src/main.c:338: RISC-V emulator is destroyed ``` ### Analysis #### Program size | Section | Original Assembly | Optimized Assembly | | ------- | ----------------- | ------------------ | | text | 2004 | 4056 | | data | 61 | 61 | | bss | 4111 | 4107 | | total | 6176 | 8224 | #### Performance Clock cycle / Instruction count | Column 1 | Column 2 | | -------- | -------- | | 9277 | 9254 | ## Quiz3 Problem C ### Fast Inverse Square Root ![image](https://hackmd.io/_uploads/Sk3gQXjJZe.png) The algorithm is based on **Newton's method**, which is used to approximate the solution for $y = 1/\sqrt{x}$. A key feature of this code is that it **does not rely on a Floating-Point Unit (FPU)**. Instead, it uses 32-bit integer and **fixed-point** arithmetic to perform all calculations. --- ### Operational Flow 1. **Initial Guess**: The `fast_rsqrt` function first calls `clz` (Count Leading Zeros) to estimate the magnitude (exponent) of the input value `x`. It uses this exponent as an index to fetch a rough initial guess from a pre-computed lookup table (`rsqrt_table`). 2. **Interpolation**: To improve the accuracy of the initial guess, the program then performs **linear interpolation** between two adjacent values in the `rsqrt_table` to obtain a more precise starting `y` value. 3. **Newton's Method Iteration**: The program then executes **two** iterations of Newton's method in the `fast_rsqrt.loop`. * The iteration formula it applies is: $y_{n+1} = y_n \left( \frac{3 - xy_n^2}{2} \right)$ * All operations are simulated using fixed-point. For example, the constant `3` is represented as `3 << 16`. * Multiplication (`y*y` and `x*y^2`) is accomplished by calling the custom `mul32` function (a 32x32 -> 64 integer multiplication) combined with bit shifts. * The "divide by 2" in the formula is ultimately implemented via a right shift (`srli ... 17`). ### Code #### C code :::spoiler **Problem_C.c** (Click to unfold) ```c # include <stdint.h> # include <stdlib.h> # include <stdbool.h> # include "print.h" #define REC_INV_SQRT_CACHE (16) static const uint32_t inv_sqrt_cache[REC_INV_SQRT_CACHE] = { ~0U, ~0U, 3037000500, 2479700525, 2147483647, 1920767767, 1753413056, 1623345051, 1518500250, 1431655765, 1358187914, 1294981364, 1239850263, 1191209601, 1147878294, 1108955788 }; static const uint16_t rsqrt_table[32] = { 0, 46341, 32768, 23170, 16384, /* 2^0 to 2^4 */ 11585, 8192, 5793, 4096, 2896, /* 2^5 to 2^9 */ 2048, 1448, 1024, 724, 512, /* 2^10 to 2^14 */ 362, 256, 181, 128, 90, /* 2^15 to 2^19 */ 64, 45, 32, 23, 16, /* 2^20 to 2^24 */ 11, 8, 6, 4, 3, /* 2^25 to 2^29 */ 2, 1, /* 2^30 to 2^31 */ }; /* * Newton iteration: new_y = y * (3/2 - x * y^2 / 2) * Here, y is a Q0.32 fixed-point number (< 1.0) */ static void newton_step(uint32_t *rec_inv_sqrt, uint32_t x){ uint32_t invsqrt, invsqrt2; uint64_t val; invsqrt = *rec_inv_sqrt; /* Dereference pointer */ invsqrt2 = ((uint64_t)invsqrt * invsqrt) >> 32; val = (3LL << 32) - ((uint64_t)x * invsqrt2); val >>= 2; /* Avoid overflow in following multiply */ val = (val * invsqrt) >> 31; /* Right shift by 31 = (32 - 2 + 1) */ *rec_inv_sqrt = (uint32_t)val; } static int clz(uint32_t x){ if(!x)return 32; int n = 0; if(!(x & 0xFFFF0000)) {n += 16; x <<= 16;} if(!(x & 0xFF000000)) {n += 8; x <<= 8;} if(!(x & 0xF0000000)) {n += 4; x <<= 4;} if(!(x & 0xC0000000)) {n += 2; x <<= 2;} if(!(x & 0x80000000)) {n += 1;} return n; } static uint64_t mul32(uint32_t a, uint32_t b){ uint64_t r=0; for(int i=0; i<32; i++){ if(b & (1U<<i)) r += (uint64_t)a<<i; } return r; } static uint64_t mul32_split(uint32_t a, uint32_t b){ uint32_t r_lo = 0; uint32_t r_hi = 0; for(int i = 0; i < 32; i++){ if(b & (1U << i)){ uint32_t add_lo = a << i; uint32_t add_hi = (i == 0) ? 0 : (a >> (32 - i)); uint32_t prev_lo = r_lo; r_lo += add_lo; if (r_lo < prev_lo) { r_hi += 1; } r_hi += add_hi; } } return ((uint64_t)r_hi << 32) | r_lo; } uint32_t fast_rsqrt(uint32_t x){ if(x==0) return 0xffffffff; if(x==1) return 65536; int exp = 31 - clz(x); uint32_t y = rsqrt_table[exp]; if(x > (1U << exp)){ uint32_t y_next = (exp < 31)? rsqrt_table[exp+1]: 0; uint32_t delta = y - y_next; uint32_t frac = (uint32_t) ((((uint64_t)x - (1UL << exp)) << 16) >> exp); y -= (uint32_t) ((delta * frac) >> 16); } for (int iter = 0; iter < 2; iter++){ uint32_t y2 = (uint32_t)mul32(y, y); uint32_t xy2 = (uint32_t)(mul32(x, y2) >> 16); y = (uint32_t)(mul32(y, (3u << 16) - xy2) >> 17); } return y; } int main() { uint32_t result = fast_rsqrt(5); print_dec(result); printstr("\n", 1); return 0; } ``` ::: :::info The linking step was changed from $(LD) to $(CC) (which is gcc). This is because C code requires gcc to automatically link the C runtime libraries. Add -nostartfiles flag. This flag tells gcc not to use its default startup files, and instead, to use the custom start.S in the project as the program's entry point. ```diff --- a/Makefile +++ b/Makefile_forC @@ -8,7 +8,7 @@ AFLAGS = -g $(ARCH) CFLAGS = -g -march=rv32i_zicsr -LDFLAGS = -T $(LINKER_SCRIPT) +LDFLAGS = -T $(LINKER_SCRIPT) -nostartfiles EXEC = test.elf CC = $(CROSS_COMPILE)gcc @@ -16,13 +16,13 @@ LD = $(CROSS_COMPILE)ld OBJDUMP = $(CROSS_COMPILE)objdump ... $(EXEC): $(OBJS) $(LINKER_SCRIPT) - $(LD) $(LDFLAGS) -o $@ $(OBJS) + $(CC) $(LDFLAGS) -o $@ $(OBJS) %.o: %.S $(AS) $(AFLAGS) $< -o $@ ``` ::: #### RV32I Assembly :::spoiler **rsqrt.S** (Click to unfold) ```asm= .text .globl main main: addi sp, sp, -4 sw ra, 0(sp) li a7, 0x40 # syscall: write li a0, 1 # file descriptor (stdout) la a1, str1 # Load address of li a2, 23 # length ecall la t0, test_data lw a0, 0(t0) jal print_dec li a0, 1 la a1, str2 # Load address of li a2, 6 # length ecall la a0, test_data lw a0, 0(a0) jal fast_rsqrt jal print_dec li a0, 1 la a1, str3 # Load address of li a2, 18 # length ecall lw ra, 0(sp) addi sp, sp, 4 ret # ============================================================ # function: fast_rsqrt # ============================================================ newton_step: # Input: a0 = *rec_inv_sqrt (uint32_t pointer) # a1 = x (uint32_t) addi sp, sp, -16 sw ra, 0(sp) sw s0, 4(sp) sw s1, 8(sp) sw s2, 12(sp) mv s0, a0 # s0 = rec_inv_sqrt mv s1, a1 # s1 = x lw s2, 0(s0) # s2 = *rec_inv_sqrt mv a0, s2 # a0 = invsqrt mv a1, s2 # a1 = invsqrt jal ra, mul32 # invsqrt^2, a1 = invsqrt2 = y^2_hi = invsqrt^2 >> 32 mv a0, s1 # a0 = x jal ra, mul32 # prod = x * invsqrt2 li t0, 3 beqz a0, newton_step.zero_prod_lo sub a0, zero, a0 # a0 = val_lo = 0 - prod_lo addi t0, t0, -1 # t0 = 2 newton_step.zero_prod_lo: sub a1, t0, a1 # a1 = val_hi = 3 - prod_hi andi t1, a1, 0x3 # t1 = val_hi & 0x3, lower 2 bits slli t1, t1, 30 # t1 = (val_hi & 0x3) << 30 srli a0, a0, 2 # a0 = val_lo >> 2 or a0, a0, t1 # a0 = (val_hi & 0x3) << 30 | (val_lo >> 2) srli a1, a1, 2 # a1 = val_hi >> 2 mv s1, a1 # s1 = val_hi mv a1, s2 # a1 = invsqrt jal ra, mul32 # new_val_lo = val * invsqrt mv t0, a0 # a0 = new_val_lo_lo mv a0, s1 # a0 = val_hi mv s1, a1 # s1 = new_val_lo_hi mv a1, s2 # a1 = invsqrt mv s2, t0 # s2 = new_val_lo_lo jal ra, mul32 # new_val_hi = invsqrt * val_hi add a0, s1, a0 # a0 = new_val_lo_hi + new_val_hi_lo slli a0, a0, 1 srli s2, s2, 31 # new_val_lo_lo >> 31 or a0, a0, s2 # a0 = new_val_hi + (new_val_lo_lo >> 31) sw a0, 0(s0) # *rec_inv_sqrt = new_val_lo lw ra, 0(sp) lw s0, 4(sp) lw s1, 8(sp) lw s2, 12(sp) addi sp, sp, 16 ret # ============================================================ # function: clz # ============================================================ clz: # Input: a0 = x (uint32_t) # Output: a0 = clz(x) (uint32_t) bnez a0, clz.not_zero li a0, 32 # return 32 for x == 0 ret clz.not_zero: li t0, 0 # n = 0 lui t1, 0xFFFF0 # mask = 0xFFFF0000 and t2, a0, t1 bnez t2, clz.skip1 addi t0, t0, 16 # n += 16 slli a0, a0, 16 clz.skip1: lui t1, 0xFF000 # mask = 0xFF000000 and t2, a0, t1 bnez t2, clz.skip2 addi t0, t0, 8 # n += 8 slli a0, a0, 8 clz.skip2: lui t1, 0xF0000 # mask = 0xF0000000 and t2, a0, t1 bnez t2, clz.skip3 addi t0, t0, 4 # n += 4 slli a0, a0, 4 clz.skip3: lui t1, 0xC0000 # mask = 0xC0000000 and t2, a0, t1 bnez t2, clz.skip4 addi t0, t0, 2 # n += 2 slli a0, a0, 2 clz.skip4: lui t1, 0x80000 # mask = 0x80000000 and t2, a0, t1 bnez t2, clz.end addi t0, t0, 1 # n += 1 clz.end: mv a0, t0 # return n ret # ============================================================ # function: mul32_split # ============================================================ mul32: # Input: a0 = a (uint32_t) # a1 = b (uint32_t) # Output: a0 = r_lo (low 32), a1 = r_hi (high 32) li t0, 0 # i = 0 li t1, 32 # constant 32 li t2, 0 # r_lo = 0 li t3, 0 # r_hi = 0 li t4, 1 # bitmask = 1 mul32.loop: bge t0, t1, mul32.end # if i >= 32, end loop sll t5, t4, t0 # bitmask = 1 << i and t5, a1, t5 # check if b's i-th bit beqz t5, mul32.skip1 # if bit is 0, skip sll t5, a0, t0 # add_lo = a << i li t6,0 beqz t0, mul32.skip2 # if i == 0, skip li t6, 32 sub t6, t6, t0 # 32 - i srl t6, a0, t6 # add_hi = a >> (32 - i) mul32.skip2: add t2, t2, t5 # r_lo += add_lo add t3, t3, t6 # r_hi += add_hi bgeu t2, t5, mul32.skip1 # if no overflow, skip carry addi t3, t3, 1 # if overflow, add carry to high part mul32.skip1: addi t0, t0, 1 # i++ j mul32.loop mul32.end: mv a0, t2 # return r_lo mv a1, t3 # return r_hi ret # ============================================================ # function: fast_rsqrt # ============================================================ fast_rsqrt: # Input: a0 = x (uint32_t) # Output: a0 = fast_rsqrt(x) (uint32_t) bnez a0, fast_rsqrt.not_zero li a0, 0xFFFFFFFF # return 0xFFFFFFFF for x == ret fast_rsqrt.not_zero: li t0, 1 bne a0, t0, fast_rsqrt.not_one li a0, 65536 # return 65536 for x == 1 ret fast_rsqrt.not_one: addi sp, sp, -20 sw ra, 0(sp) sw s0, 4(sp) sw s1, 8(sp) sw s2, 12(sp) sw s3, 16(sp) mv s0, a0 # s0 = x jal ra, clz li t0, 31 sub s1, t0, a0 # s1 = exp = 31 - clz(x) la s2, rsqrt_table slli t0, s1, 1 # t0 = exp * 2 add t0, t0, s2 # t0 = &rsqrt_table[exp] lhu s3, 0(t0) # s3 = y li t0, 1 # t0 = 1 sll t0, t0, s1 # t0 = 1 << exp bge t0, s0, fast_rsqrt.skip1 li t0, 31 li t1, 0 # t1 = y_next = 0 bge s1, t0, fast_rsqrt.skip1_1 addi t1, s1, 1 slli t1, t1, 1 # t1 = (exp + 1) * 2 add t1, t1, s2 # t1 = &rsqrt_table[exp + 1] lhu t1, 0(t1) # t1 = rsqrt_table[exp + 1] fast_rsqrt.skip1_1: sub t2, s3, t1 # t2 = delta = y - y_next li t0, 1 sll t0, t0, s1 # t0 = 1 << exp sub t0, s0, t0 # t0 = x - (1 << exp) slli t0, t0, 16 # t0 = (x - (1 << exp)) << 16 srl t0, t0, s1 # t0 = frac = ((x - (1 << exp)) << 16)/(1 << exp) mv a0, t2 # a0 = delta mv a1, t0 # a1 = frac jal ra, mul32 # prod = delta * frac srli t0, a0, 16 # t0 = prod_lo >> 16 slli t1, a1, 16 # t1 = prod_hi << 16 or t0, t1, t0 # a0 = prod_hi << 16 | (prod_lo >> 16) sub s3, s3, t0 # y -= prod >> 16 fast_rsqrt.skip1: li s1, 0 # s1 = i = 0 li s2, 2 # s2 = 2 fast_rsqrt.loop: bge s1, s2, fast_rsqrt.end_loop mv a0, s3 # a0 = y mv a1, s3 # a1 = y jal ra, mul32 # y^2, a0 = y^2_lo, a1 = y^2_hi or a1, a1, a0 # a1 = y2 = y^2 >> 16 mv a0, s0 # a0 = x jal ra, mul32 # prod = x * y2 slli a1, a1, 16 srli a0, a0, 16 or a1, a1, a0 # a1 = xy2 = x * y2 >> 16 li t0, 3 slli t0, t0, 16 # t0 = 3 << 16 sub a1, t0, a1 mv a0, s3 # a0 = y jal ra, mul32 # new_y = y * (3 << 16 - xy2) srli a0, a0, 17 # new_y_lo >> 17 slli a1, a1, 15 # new_y_hi << 15 or s3, a1, a0 # s3 = new_y_hi << 15 | (new_y_lo >> 17) addi s1, s1, 1 # i++ j fast_rsqrt.loop fast_rsqrt.end_loop: mv a0, s3 # return y lw s3, 16(sp) lw s2, 12(sp) lw s1, 8(sp) lw s0, 4(sp) lw ra, 0(sp) addi sp, sp, 20 ret # ============================================================ # Data Section # ============================================================ .data inv_sqrt_cache: .word 0xFFFFFFFF, 0xFFFFFFFF, 3037000500, 2479700525 .word 2147483647, 1920767767, 1753413056, 1623345051 .word 1518500250, 1431655765, 1358187914, 1294981364 .word 1264197512, 1220703125, 1181116006, 1145324612 rsqrt_table: .half 65535, 46341, 32768, 23170, 16384 .half 11585, 8192, 5793, 4096, 2896 .half 2048, 1448, 1024, 724, 512 .half 362, 256, 181, 128, 90 .half 64, 45, 32, 23, 16 .half 11, 8, 6, 4, 3 .half 2, 1 test_data: .word 4 str1: .asciz "Reverse Square Root of " str2: .asciz " is: " str3: .asciz " in fp32 encoding\n" ``` ::: ### Optimization 1. ```mul32 Function (32x32 multiplication)```: In the original version (rsqrt.S), mul32 contained a loop (mul32.loop) that iterated 32 times. In the optimized version, this loop has been fully unrolled. The author removed all loop-control instructions (bge, j) and sequentially pasted all 32 copies of the loop body (from i = 0 to i = 31). 2. ```fast_rsqrt Function (Newton's Method Iteration)```: In the original version, fast_rsqrt.loop was a loop that executed 2 times for the Newton's method iteration. In the optimized version, this loop was also unrolled. The author duplicated the iteration code twice, allowing it to execute as linear, sequential code. :::spoiler **Modification** (Click to unfold) ```diff= --- a/rsqrt.S +++ b/rsqrt_optim.S @@ -107,32 +107,317 @@ # ============================================================ # function: mul32_split # ============================================================ mul32: # Input: a0 = a (uint32_t) # a1 = b (uint32_t) # Output: a0 = r_lo (low 32), a1 = r_hi (high 32) - li t0, 0 # i = 0 - li t1, 32 # constant 32 li t2, 0 # r_lo = 0 li t3, 0 # r_hi = 0 li t4, 1 # bitmask = 1 -mul32.loop: - bge t0, t1, mul32.end # if i >= 32, end loop - sll t5, t4, t0 # bitmask = 1 << i - and t5, a1, t5 # check if b's i-th bit - beqz t5, mul32.skip1 # if bit is 0, skip - sll t5, a0, t0 # add_lo = a << i - li t6,0 - beqz t0, mul32.skip2 # if i == 0, skip - li t6, 32 - sub t6, t6, t0 # 32 - i - srl t6, a0, t6 # add_hi = a >> (32 - i) -mul32.skip2: - add t2, t2, t5 # r_lo += add_lo - add t3, t3, t6 # r_hi += add_hi - bgeu t2, t5, mul32.skip1 # if no overflow, skip carry - addi t3, t3, 1 # if overflow, add carry to high part -mul32.skip1: - addi t0, t0, 1 # i++ - j mul32.loop + +# for i = 0 + slli t5, t4, 0 # bitmask = 1 << 0 + and t5, a1, t5 # check if b's 0-th bit + beqz t5, mul32.skip1_0 # if bit is 0, skip + slli t5, a0, 0 # add_lo = a << 0 + li t6, 0 # add_hi = 0 + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_0 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_0: + +# for i = 1 + slli t5, t4, 1 # bitmask = 1 << 1 + and t5, a1, t5 # check if b's 1-th bit + beqz t5, mul32.skip1_1 # if bit is 0, skip + slli t5, a0, 1 # add_lo = a << 1 + li t6, 31 # 32 - 1 + srl t6, a0, t6 # add_hi = a >> (32 - 1) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_1 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_1: + +# for i = 2 + slli t5, t4, 2 # bitmask = 1 << 2 + and t5, a1, t5 # check if b's 2-th bit + beqz t5, mul32.skip1_2 # if bit is 0, skip + slli t5, a0, 2 # add_lo = a << 2 + li t6, 30 # 32 - 2 + srl t6, a0, t6 # add_hi = a >> (32 - 2) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_2 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_2: + +# for i = 3 + slli t5, t4, 3 # bitmask = 1 << 3 + and t5, a1, t5 # check if b's 3-th bit + beqz t5, mul32.skip1_3 # if bit is 0, skip + slli t5, a0, 3 # add_lo = a << 3 + li t6, 29 # 32 - 3 + srl t6, a0, t6 # add_hi = a >> (32 - 3) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_3 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_3: + +# for i = 4 + slli t5, t4, 4 # bitmask = 1 << 4 + and t5, a1, t5 # check if b's 4-th bit + beqz t5, mul32.skip1_4 # if bit is 0, skip + slli t5, a0, 4 # add_lo = a << 4 + li t6, 28 # 32 - 4 + srl t6, a0, t6 # add_hi = a >> (32 - 4) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_4 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_4: + +# for i = 5 + slli t5, t4, 5 # bitmask = 1 << 5 + and t5, a1, t5 # check if b's 5-th bit + beqz t5, mul32.skip1_5 # if bit is 0, skip + slli t5, a0, 5 # add_lo = a << 5 + li t6, 27 # 32 - 5 + srl t6, a0, t6 # add_hi = a >> (32 - 5) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_5 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_5: + +# for i = 6 + slli t5, t4, 6 # bitmask = 1 << 6 + and t5, a1, t5 # check if b's 6-th bit + beqz t5, mul32.skip1_6 # if bit is 0, skip + slli t5, a0, 6 # add_lo = a << 6 + li t6, 26 # 32 - 6 + srl t6, a0, t6 # add_hi = a >> (32 - 6) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_6 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_6: + +# for i = 7 + slli t5, t4, 7 # bitmask = 1 << 7 + and t5, a1, t5 # check if b's 7-th bit + beqz t5, mul32.skip1_7 # if bit is 0, skip + slli t5, a0, 7 # add_lo = a << 7 + li t6, 25 # 32 - 7 + srl t6, a0, t6 # add_hi = a >> (32 - 7) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_7 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_7: + +# for i = 8 + slli t5, t4, 8 # bitmask = 1 << 8 + and t5, a1, t5 # check if b's 8-th bit + beqz t5, mul32.skip1_8 # if bit is 0, skip + slli t5, a0, 8 # add_lo = a << 8 + li t6, 24 # 32 - 8 + srl t6, a0, t6 # add_hi = a >> (32 - 8) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_8 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_8: + +# for i = 9 + slli t5, t4, 9 # bitmask = 1 << 9 + and t5, a1, t5 # check if b's 9-th bit + beqz t5, mul32.skip1_9 # if bit is 0, skip + slli t5, a0, 9 # add_lo = a << 9 + li t6, 23 # 32 - 9 + srl t6, a0, t6 # add_hi = a >> (32 - 9) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_9 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_9: + +# for i = 10 + slli t5, t4, 10 # bitmask = 1 << 10 + and t5, a1, t5 # check if b's 10-th bit + beqz t5, mul32.skip1_10 # if bit is 0, skip + slli t5, a0, 10 # add_lo = a << 10 + li t6, 22 # 32 - 10 + srl t6, a0, t6 # add_hi = a >> (32 - 10) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_10 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_10: + +# for i = 11 + slli t5, t4, 11 # bitmask = 1 << 11 + and t5, a1, t5 # check if b's 11-th bit + beqz t5, mul32.skip1_11 # if bit is 0, skip + slli t5, a0, 11 # add_lo = a << 11 + li t6, 21 # 32 - 11 + srl t6, a0, t6 # add_hi = a >> (32 - 11) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_11 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_11: + +# for i = 12 + slli t5, t4, 12 # bitmask = 1 << 12 + and t5, a1, t5 # check if b's 12-th bit + beqz t5, mul32.skip1_12 # if bit is 0, skip + slli t5, a0, 12 # add_lo = a << 12 + li t6, 20 # 32 - 12 + srl t6, a0, t6 # add_hi = a >> (32 - 12) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_12 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_12: + +# for i = 13 + slli t5, t4, 13 # bitmask = 1 << 13 + and t5, a1, t5 # check if b's 13-th bit + beqz t5, mul32.skip1_13 # if bit is 0, skip + slli t5, a0, 13 # add_lo = a << 13 + li t6, 19 # 32 - 13 + srl t6, a0, t6 # add_hi = a >> (32 - 13) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_13 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_13: + +# for i = 14 + slli t5, t4, 14 # bitmask = 1 << 14 + and t5, a1, t5 # check if b's 14-th bit + beqz t5, mul32.skip1_14 # if bit is 0, skip + slli t5, a0, 14 # add_lo = a << 14 + li t6, 18 # 32 - 14 + srl t6, a0, t6 # add_hi = a >> (32 - 14) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_14 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_14: + +# for i = 15 + slli t5, t4, 15 # bitmask = 1 << 15 + and t5, a1, t5 # check if b's 15-th bit + beqz t5, mul32.skip1_15 # if bit is 0, skip + slli t5, a0, 15 # add_lo = a << 15 + li t6, 17 # 32 - 15 + srl t6, a0, t6 # add_hi = a >> (32 - 15) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_15 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_15: + +# for i = 16 + slli t5, t4, 16 # bitmask = 1 << 16 + and t5, a1, t5 # check if b's 16-th bit + beqz t5, mul32.skip1_16 # if bit is 0, skip + slli t5, a0, 16 # add_lo = a << 16 + li t6, 16 # 32 - 16 + srl t6, a0, t6 # add_hi = a >> (32 - 16) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_16 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_16: + +# for i = 17 + slli t5, t4, 17 # bitmask = 1 << 17 + and t5, a1, t5 # check if b's 17-th bit + beqz t5, mul32.skip1_17 # if bit is 0, skip + slli t5, a0, 17 # add_lo = a << 17 + li t6, 15 # 32 - 17 + srl t6, a0, t6 # add_hi = a >> (32 - 17) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_17 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_17: + +# for i = 18 + slli t5, t4, 18 # bitmask = 1 << 18 + and t5, a1, t5 # check if b's 18-th bit + beqz t5, mul32.skip1_18 # if bit is 0, skip + slli t5, a0, 18 # add_lo = a << 18 + li t6, 14 # 32 - 18 + srl t6, a0, t6 # add_hi = a >> (32 - 18) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_18 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_18: + +# for i = 19 + slli t5, t4, 19 # bitmask = 1 << 19 + and t5, a1, t5 # check if b's 19-th bit + beqz t5, mul32.skip1_19 # if bit is 0, skip + slli t5, a0, 19 # add_lo = a << 19 + li t6, 13 # 32 - 19 + srl t6, a0, t6 # add_hi = a >> (32 - 19) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_19 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_19: + +# for i = 20 + slli t5, t4, 20 # bitmask = 1 << 20 + and t5, a1, t5 # check if b's 20-th bit + beqz t5, mul32.skip1_20 # if bit is 0, skip + slli t5, a0, 20 # add_lo = a << 20 + li t6, 12 # 32 - 20 + srl t6, a0, t6 # add_hi = a >> (32 - 20) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_20 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_20: + +# for i = 21 + slli t5, t4, 21 # bitmask = 1 << 21 + and t5, a1, t5 # check if b's 21-th bit + beqz t5, mul32.skip1_21 # if bit is 0, skip + slli t5, a0, 21 # add_lo = a << 21 + li t6, 11 # 32 - 21 + srl t6, a0, t6 # add_hi = a >> (32 - 21) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_21 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_21: + +# for i = 22 + slli t5, t4, 22 # bitmask = 1 << 22 + and t5, a1, t5 # check if b's 22-th bit + beqz t5, mul32.skip1_22 # if bit is 0, skip + slli t5, a0, 22 # add_lo = a << 22 + li t6, 10 # 32 - 22 + srl t6, a0, t6 # add_hi = a >> (32 - 22) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_22 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_22: + +# for i = 23 + slli t5, t4, 23 # bitmask = 1 << 23 + and t5, a1, t5 # check if b's 23-th bit + beqz t5, mul32.skip1_23 # if bit is 0, skip + slli t5, a0, 23 # add_lo = a << 23 + li t6, 9 # 32 - 23 + srl t6, a0, t6 # add_hi = a >> (32 - 23) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_23 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_23: + +# for i = 24 + slli t5, t4, 24 # bitmask = 1 << 24 + and t5, a1, t5 # check if b's 24-th bit + beqz t5, mul32.skip1_24 # if bit is 0, skip + slli t5, a0, 24 # add_lo = a << 24 + li t6, 8 # 32 - 24 + srl t6, a0, t6 # add_hi = a >> (32 - 24) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_24 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_24: + +# for i = 25 + slli t5, t4, 25 # bitmask = 1 << 25 + and t5, a1, t5 # check if b's 25-th bit + beqz t5, mul32.skip1_25 # if bit is 0, skip + slli t5, a0, 25 # add_lo = a << 25 + li t6, 7 # 32 - 25 + srl t6, a0, t6 # add_hi = a >> (32 - 25) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_25 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_25: + +# for i = 26 + slli t5, t4, 26 # bitmask = 1 << 26 + and t5, a1, t5 # check if b's 26-th bit + beqz t5, mul32.skip1_26 # if bit is 0, skip + slli t5, a0, 26 # add_lo = a << 26 + li t6, 6 # 32 - 26 + srl t6, a0, t6 # add_hi = a >> (32 - 26) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_26 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_26: + +# for i = 27 + slli t5, t4, 27 # bitmask = 1 << 27 + and t5, a1, t5 # check if b's 27-th bit + beqz t5, mul32.skip1_27 # if bit is 0, skip + slli t5, a0, 27 # add_lo = a << 27 + li t6, 5 # 32 - 27 + srl t6, a0, t6 # add_hi = a >> (32 - 27) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_27 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_27: + +# for i = 28 + slli t5, t4, 28 # bitmask = 1 << 28 + and t5, a1, t5 # check if b's 28-th bit + beqz t5, mul32.skip1_28 # if bit is 0, skip + slli t5, a0, 28 # add_lo = a << 28 + li t6, 4 # 32 - 28 + srl t6, a0, t6 # add_hi = a >> (32 - 28) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_28 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_28: + +# for i = 29 + slli t5, t4, 29 # bitmask = 1 << 29 + and t5, a1, t5 # check if b's 29-th bit + beqz t5, mul32.skip1_29 # if bit is 0, skip + slli t5, a0, 29 # add_lo = a << 29 + li t6, 3 # 32 - 29 + srl t6, a0, t6 # add_hi = a >> (32 - 29) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_29 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_29: + +# for i = 30 + slli t5, t4, 30 # bitmask = 1 << 30 + and t5, a1, t5 # check if b's 30-th bit + beqz t5, mul32.skip1_30 # if bit is 0, skip + slli t5, a0, 30 # add_lo = a << 30 + li t6, 2 # 32 - 30 + srl t6, a0, t6 # add_hi = a >> (32 - 30) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_30 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_30: + +# for i = 31 + slli t5, t4, 31 # bitmask = 1 << 31 + and t5, a1, t5 # check if b's 31-th bit + beqz t5, mul32.skip1_31 # if bit is 0, skip + slli t5, a0, 31 # add_lo = a << 31 + li t6, 1 # 32 - 31 + srl t6, a0, t6 # add_hi = a >> (32 - 31) + add t2, t2, t5 # r_lo += add_lo + add t3, t3, t6 # r_hi += add_hi + bgeu t2, t5, mul32.skip1_31 # if no overflow, skip carry + addi t3, t3, 1 # if overflow, add carry to high part +mul32.skip1_31: + mul32.end: mv a0, t2 # return r_lo mv a1, t3 # return r_hi @@ -196,10 +481,7 @@ slli t0, t0, 16 # t0 = (x - (1 << exp)) << 16 srl t0, t0, s1 # t0 = frac = ((x - (1 << exp)) << 16)/(1 << exp) mv a0, t2 # a0 = delta - mv a1, t0 # a1 = frac - jal ra, mul32 # prod = delta * frac - srli t0, a0, 16 # t0 = prod_lo >> 16 - slli t1, a1, 16 # t1 = prod_hi << 16 + mv a1, t0 # a1 = frac + jal ra, mul32 # prod = delta * frac + srli t0, a0, 16 # t0 = prod_lo >> 16 + slli t1, a1, 16 # t1 = prod_hi << 16 or t0, t1, t0 # a0 = prod_hi << 16 | (prod_lo >> 16) sub s3, s3, t0 # y -= prod >> 16 fast_rsqrt.skip1: - li s1, 0 # s1 = i = 0 - li s2, 2 # s2 = 2 -fast_rsqrt.loop: - bge s1, s2, fast_rsqrt.end_loop mv a0, s3 # a0 = y mv a1, s3 # a1 = y jal ra, mul32 # y^2, a0 = y^2_lo, a1 = y^2_hi @@ -217,9 +499,22 @@ srli a0, a0, 17 # new_y_lo >> 17 slli a1, a1, 15 # new_y_hi << 15 or s3, a1, a0 # s3 = new_y_hi << 15 | (new_y_lo >> 17) - addi s1, s1, 1 # i++ - j fast_rsqrt.loop -fast_rsqrt.end_loop: + mv a0, s3 # a0 = y + mv a1, s3 # a1 = y + jal ra, mul32 # y^2, a0 = y^2_lo, a1 = y^2_hi + or a1, a1, a0 # a1 = y2 = y^2 >> 16 + mv a0, s0 # a0 = x + jal ra, mul32 # prod = x * y2 + slli a1, a1, 16 + srli a0, a0, 16 + or a1, a1, a0 # a1 = xy2 = x * y2 >> 16 + li t0, 3 + slli t0, t0, 16 # t0 = 3 << 16 + sub a1, t0, a1 + mv a0, s3 # a0 = y + jal ra, mul32 # new_y = y * (3 << 16 - xy2) + srli a0, a0, 17 # new_y_lo >> 17 + slli a1, a1, 15 # new_y_hi << 15 + or s3, a1, a0 # s3 = new_y_hi << 15 | (new_y_lo >> 17) + +fast_rsqrt.end_loop: mv a0, s3 # return y lw s3, 16(sp) lw s2, 12(sp) ``` ::: #### Output: ``` 17:26:19 INFO src/riscv.c:552: Log level: TRACE 17:26:19 INFO src/riscv.c:565: test.elf ELF loaded 17:26:19 INFO src/main.c:315: RISC-V emulator is created and ready to run Reverse Square Root of 19 is: 15034 in fp32 encoding 17:26:19 INFO src/main.c:338: RISC-V emulator is destroyed ``` ### Analysis #### Program size | Section | C code | Original Assembly | Optimized Assembly | | ------- | ------ | ----------------- | ------------------ | | text | 4125 | 2380 | 3632 | | data | 0 | 214 | 214 | | bss | 4099 | 4110 | 4106 | | total | 8224 | 6704 | 7952 | #### Performance - best case(exception): fast_rsqrt(0) - best case(normal): fast_rsqrt(2) - worst case: fast_rsqrt(3) Clock cycles / Instruction count: | Cases | C code | Original Assembly | Optimized Assembly | | --------------- | ------ | ----------------- | ------------------ | | Best(exception) | 64 | 22 | 22 | | Best(normal) | 3381 | 1564 | 885 | | Worst | 4561 | 2111 | 1225 | #### Precision The script iterates through all integer values of x from 1 to 65535. It compares the "approximate value" (y_approx) derived from the table against the "exact value" (y_exact) computed using math.sqrt(), and then calculates the relative error between them. - Maximum Relative Error: **0.414192** - Average Relative Error: **0.218814** ![rsqrt_initial_error_plot](https://hackmd.io/_uploads/r1xlqo3Jbe.png) :::spoiler **Code** (Click to unfold) ```python= import math import numpy as np import matplotlib.pyplot as plt rsqrt_table = np.array([65535, 46341, 32768, 23170, 16384, 11585, 8192, 5793, 4096, 2896, 2048, 1448, 1024, 724, 512, 362, 256, 181, 128, 90, 64, 45, 32, 23, 16, 11, 8, 6, 4, 3, 2, 1], dtype=np.uint16) def compute_initial_error(x): if x == 0: return np.nan clz = 32 - (int(x).bit_length() if x > 0 else 0) exp = 31 - clz if exp < 0 or exp >= 32: raise ValueError(f"Invalid exp {exp} for x={x}") y_init = rsqrt_table[exp] y_approx = y_init / 65536.0 y_exact = 1.0 / math.sqrt(x) return abs(y_approx - y_exact) / y_exact x_values = np.arange(1, 0x10000) # 1 to 65535 errors = np.array([compute_initial_error(x) for x in x_values]) max_err = np.nanmax(errors) min_err = np.nanmin(errors) avg_err = np.nanmean(errors) print(f"Maximum Relative Error: {max_err:.6f}") print(f"Minimum Relative Error: {min_err:.6f}") print(f"Average Relative Error: {avg_err:.6f}") plt.figure(figsize=(10, 6)) plt.plot(x_values, errors, label='Relative Error', color='blue', alpha=0.7) plt.xscale('log') plt.xlabel('x Value (log scale)') plt.ylabel('Relative Error') plt.title('Distribution of Initial Relative Error in Fast Rsqrt Guess (x=1 to 65535)') plt.grid(True) plt.legend() plt.savefig('rsqrt_initial_error_plot.png') plt.show() ``` ::: ## References * [Lab2: RISC-V Instruction Set Simulator and System Emulator](https://hackmd.io/@sysprog/Sko2Ja5pel#Setup) * [riscv-none-elf-gcc-xpack](https://github.com/xpack-dev-tools/riscv-none-elf-gcc-xpack) * [rv32emu](https://github.com/sysprog21/rv32emu) * [Approximating sine function in BF16 data type](https://hackmd.io/@Max042004/bf16_sin)