# Assignment2: Complete Applications
contributed by < [Wei-Chen Lai](https://github.com/Winstonllllai) >
> [GitHub](https://github.com/Winstonllllai/ca2025-HW2)
## Set Up Environment On MacOS
1. Install RISC-V GNU Toolchain
open terminal and execute instructions
```bash
wget https://github.com/xpack-dev-tools/riscv-none-elf-gcc-xpack/releases/download/v15.2.0-1/xpack-riscv-none-elf-gcc-15.2.0-1-darwin-arm64.tar.gz
tar zxvf xpack-riscv-none-elf-gcc-15.2.0-1-darwin-arm64.tar.gz
cp -af xpack-riscv-none-elf-gcc-15.2.0-1 $HOME/riscv-none-elf-gcc
cd $HOME/riscv-none-elf-gcc
echo "export PATH=`pwd`/bin:\$PATH" > setenv
source $HOME/riscv-none-elf-gcc/setenv
```
validate the installation is successful
```bash
riscv-none-embed-gcc -v
```
output:
```
Using built-in specs.
COLLECT_GCC=riscv-none-elf-gcc
COLLECT_LTO_WRAPPER=/Users/winston/riscv-none-elf-gcc/bin/../libexec/gcc/riscv-none-elf/14.2.0/lto-wrapper
Target: riscv-none-elf
Configured with: /Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/sources/gcc-14.2.0/configure --prefix=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/application --with-sysroot=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/application/riscv-none-elf --with-native-system-header-dir=/include --infodir=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install/share/info --mandir=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install/share/man --htmldir=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install/share/html --pdfdir=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install/share/pdf --build=aarch64-apple-darwin20.6.0 --host=aarch64-apple-darwin20.6.0 --target=riscv-none-elf --disable-nls --disable-shared --disable-threads --disable-tls --enable-checking=release --enable-languages=c,c++ --with-gmp=/Users/ilg/actions-runners/xpack-dev-tools/_work/riscv-none-elf-gcc-xpack/riscv-none-elf-gcc-xpack/build-assets/build/darwin-arm64/aarch64-apple-darwin20.6.0/install --with-newlib --with-pkgversion='xPack GNU RISC-V Embedded GCC arm64' --with-gnu-as --with-gnu-ld --with-system-zlib --with-abi=ilp32 --with-arch=rv32imac --enable-multilib
Thread model: single
Supported LTO compression algorithms: zlib zstd
gcc version 14.2.0 (xPack GNU RISC-V Embedded GCC arm64)
```
2. Install Dependency
```bash
brew install sdl2 sdl2_mixer
```
3. Build rv32emu
```bash=
git clone https://github.com/sysprog21/rv32emu
cd rv32emu
make
```
4. Make Test
test result:
```
Linux image is found. Skipping downloading.
Running hello.elf ... [OK]
Fetching SHA-1 of prebuilt binaries ... skipped
Checking SHA-1 of prebuilt binaries ... [OK]
Running puzzle ... [OK]
Running fcalc ... [OK]
Running pi ... [OK]
```
## Run Program on Bare Metal
### Print on Terminal
In the previous assignment (arch2025-homework1), we used the [Ripes](https://github.com/mortbopet/Ripes) simulator for RV32I instruction simulation. To print any information to the terminal, we made direct ecalls. Here is a code segment from HW1 Problem B as an example:
```
250: produced value 0xcfff0, re-encoded to 250
```
The instructions are as follows:
```asm=
mv a0, t4 # a0 = fl
li a7, 1 # syscall: print integer
ecall
la a0, str1 # Load address of str1
li a7, 4 # syscall: print string
ecall
mv a0, t5 # a0 = value
li a7, 34 # syscall: print hex integer
ecall
la a0, str2 # Load address of str2
li a7, 4 # syscall: print string
ecall
mv a0, t6 # a0 = fl2
li a7, 1 # syscall: print integer
ecall
la a0, str6 # Load address of str6
li a7, 4 # syscall: print string
ecall # syscall: print string
.data
str1: .string ": produced value "
str2: .string ", re-encoded to "
str6: .string "\n"
```
However, when doing bare-metal development with [rv32emu](https://github.com/sysprog21/rv32emu), the system call mechanism is different from that of Ripes. Therefore, this functionality was encapsulated into print.c, allowing it to be invoked as a standard function.
#### Code
::: spoiler **print.c** (Click to unfold)
```c
/* Bare metal memcpy implementation */
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include "print.h"
void *memcpy(void *dest, const void *src, size_t n)
{
uint8_t *d = (uint8_t *) dest;
const uint8_t *s = (const uint8_t *) src;
while (n--)
*d++ = *s++;
return dest;
}
/* Software division for RV32I (no M extension) */
unsigned long udiv(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long quotient = 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
quotient |= (1UL << i);
}
}
return quotient;
}
unsigned long umod(unsigned long dividend, unsigned long divisor)
{
if (divisor == 0)
return 0;
unsigned long remainder = 0;
for (int i = 31; i >= 0; i--) {
remainder <<= 1;
remainder |= (dividend >> i) & 1;
if (remainder >= divisor) {
remainder -= divisor;
}
}
return remainder;
}
/* Software multiplication for RV32I (no M extension) */
uint32_t umul(uint32_t a, uint32_t b)
{
uint32_t result = 0;
while (b) {
if (b & 1)
result += a;
a <<= 1;
b >>= 1;
}
return result;
}
/* Provide __mulsi3 for GCC */
uint32_t __mulsi3(uint32_t a, uint32_t b)
{
return umul(a, b);
}
/* Simple integer to hex string conversion */
void print_hex(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf) - 1;
if (val == 0) {
*p = '0';
p--;
} else {
while (val > 0) {
int digit = val & 0xf;
*p = (digit < 10) ? ('0' + digit) : ('a' + digit - 10);
p--;
val >>= 4;
}
}
p++;
printstr(p, (buf + sizeof(buf) - p));
}
/* Simple integer to 2-digit hex string conversion (zero-padded) */
void print_hex_byte(uint8_t val)
{
char buf[2];
int digit1 = (val >> 4) & 0xf;
int digit2 = val & 0xf;
buf[0] = (digit1 < 10) ? ('0' + digit1) : ('a' + digit1 - 10);
buf[1] = (digit2 < 10) ? ('0' + digit2) : ('a' + digit2 - 10);
printstr(buf, 2);
}
/* Simple integer to decimal string conversion */
void print_dec(unsigned long val)
{
char buf[20];
char *p = buf + sizeof(buf) - 1;
if (val == 0) {
*p = '0';
p--;
} else {
while (val > 0) {
*p = '0' + umod(val, 10);
p--;
val = udiv(val, 10);
}
}
p++;
printstr(p, (buf + sizeof(buf) - p));
}
```
:::
:::spoiler **print.h** (Click to unfold)
```c
# ifndef PRINT_H
# define PRINT_H
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#define printstr(ptr, length) \
do { \
asm volatile( \
"add a7, x0, 0x40;" \
"add a0, x0, 0x1;" /* stdout */ \
"add a1, x0, %0;" \
"mv a2, %1;" /* length character */ \
"ecall;" \
: \
: "r"(ptr), "r"(length) \
: "a0", "a1", "a2", "a7"); \
} while (0)
#define TEST_OUTPUT(msg, length) printstr(msg, length)
#define TEST_LOGGER(msg) \
{ \
char _msg[] = msg; \
TEST_OUTPUT(_msg, sizeof(_msg) - 1); \
}
extern uint64_t get_cycles(void);
extern uint64_t get_instret(void);
void *memcpy(void *dest, const void *src, size_t n);
unsigned long udiv(unsigned long dividend, unsigned long divisor);
unsigned long umod(unsigned long dividend, unsigned long divisor);
uint32_t umul(uint32_t a, uint32_t b);
uint32_t __mulsi3(uint32_t a, uint32_t b);
void print_hex(unsigned long val);
void print_hex_byte(uint8_t val);
void print_dec(unsigned long val);
#endif
```
:::
### Linker Script
- Purpose: Draws the "memory map."
- It tells the linker where in memory to place the code (.text) and data (.data) (e.g., at 0x10000).
- It defines symbols the C environment needs, most importantly, where the top of the stack is (__stack_top).
```
High Address ⬆︎
+-----------------------+
| .stack | <-- Stack Section (4KB)
| (...Free Space...) |
+-----------------------+ <-- __stack_top (Symbol)
| (Free Space...) |
| |
+-----------------------+ <-- __bss_end (Symbol)
| .bss | <-- Uninitialized Globals
+-----------------------+ <-- __bss_start (Symbol)
| .data | <-- Initialized Data
| (str1: "Move Disk ") |
| (obdata: 0x3c, 0x3b) |
+-----------------------+
| .text | <-- Code Section
| (print_dec func...) |
| (main func...) |
| (_start func...) |
0x10000+-----------------------+ <-- Program Start Address
| (Reserved) |
0x00000+-----------------------+
Low Address ⬇︎
```
```linker=
OUTPUT_ARCH("riscv");
ENTRY(_start);
SECTIONS
{
. = 0x10000;
.text : {
*(.text._start)
*(.text)
}
.data : {
*(.data)
}
.bss : {
__bss_start = .;
*(.bss)
__bss_end = .;
}
.stack (NOLOAD) : {
. = ALIGN(16);
__stack_start = .;
. += 4096;
__stack_top = .;
}
}
```
### Makefile
- Purpose: To automate the build process.
- It compiles all your .c files and .S files into .o object files. Then, it tells the linker (ld) to use the linker.ld's rules ("the map") to "glue" all the parts (like start.o and your code) together into a single test.elf executable.
```makefile=
CROSS_COMPILE = riscv-none-elf-
include ../../../mk/toolchain.mk
ARCH = -march=rv32izicsr
LINKER_SCRIPT = linker.ld
EMU ?= ../../../build/rv32emu
AFLAGS = -g $(ARCH)
CFLAGS = -g -march=rv32i_zicsr
LDFLAGS = -T $(LINKER_SCRIPT)
EXEC = test.elf
CC = $(CROSS_COMPILE)gcc
AS = $(CROSS_COMPILE)as
LD = $(CROSS_COMPILE)ld
OBJDUMP = $(CROSS_COMPILE)objdump
OBJS = start.o uf8_optim.o print.o
.PHONY: all run dump clean
all: $(EXEC)
$(EXEC): $(OBJS) $(LINKER_SCRIPT)
$(LD) $(LDFLAGS) -o $@ $(OBJS)
%.o: %.S
$(AS) $(AFLAGS) $< -o $@
%.o: %.c
$(CC) $(CFLAGS) $< -o $@ -c
run: $(EXEC)
@test -f $(EMU) || (echo "Error: $(EMU) not found" && exit 1)
@grep -q "ENABLE_ELF_LOADER=1" ../../../build/.config || (echo "Error: ENABLE_ELF_LOADER=1 not set" && exit 1)
@grep -q "ENABLE_SYSTEM=1" ../../../build/.config || (echo "Error: ENABLE_SYSTEM=1 not set" && exit 1)
$(EMU) $<
dump: $(EXEC)
$(OBJDUMP) -Ds $< | less
clean:
rm -f $(EXEC) $(OBJS)v
```
:::info
Defining CROSS_COMPILE = riscv-none-elf- is necessary to override the toolchain.mk script's auto-detection.
Without it, the script may incorrectly select a 64-bit toolchain, which conflicts with this 32-bit project (-march=rv32izicsr). Setting this variable before the include forces the build to use the correct 32-bit compiler (CC), assembler (AS), and linker (LD), preventing a 32/64-bit incompatibility error.
:::
### Start up Code
- Purpose: Does the "prep work" before main.
- Sets the Stack Pointer (sp): This is required for C function calls to work. It runs la sp, __stack_top, using the symbol from the linker script.
- Prepares the C environment: Clears the BSS section (zeroing out uninitialized global variables).
- Calls main: After setup, it finally executes call main to run your C code.
```asm=
.section .text._start
.globl _start
.type _start, @function
_start:
la sp, __stack_top
la t0, __bss_start
la t1, __bss_end
clear_bss:
bge t0, t1, bss_cleared
sw zero, 0(t0)
addi t0, t0, 4
j clear_bss
bss_cleared:
call main
li a7, 93
li a0, 0
ecall
infinite_loop:
j infinite_loop
.size _start, .-_start
.weak __bss_start
.weak __bss_end
.weak __stack_top
```
### Analysis
#### Program size
To display section size, execute instruction below:
```
riscv-none-elf-size build/hello.elf
```
Expected output:
```
text data bss dec hex filename
80 0 0 80 50 build/hello.elf
```
#### Performance
1. **CPU Execution Time**
The total time a program takes to run is a function of the total CPU cycles it consumes and the CPU's clock speed.
$$\text{Execution Time} = \frac{\text{Total Cycles}}{\text{Clock Rate}}$$
* **Total Cycles**: The total number of CPU cycles the program took to complete.
* **Clock Rate**: How many cycles the CPU can execute per second (e.g., 3 GHz).
2. **Performance Decomposition**
The "Total Cycles" from the first formula can be broken down further, giving the most famous and important performance equation:
$$\text{Execution Time} = \frac{\text{Instructions}}{\text{Program}} \times \frac{\text{Cycles}}{\text{Instruction}} \times \frac{\text{Time}}{\text{Cycle}}$$
We can simplify these three terms into:
$$\text{Execution Time} = \text{Instruction Count} \times \text{CPI} \times \text{Clock Cycle Time}$$
* **Instruction Count (IC)**: The total number of instructions executed by the program.
* **CPI (Cycles Per Instruction)**: How many CPU cycles it takes to execute **one instruction on average**. This is a key metric for CPU architecture efficiency.
* **Clock Cycle Time**: The time required for one CPU cycle (e.g., for a 3 GHz Clock Rate, the Clock Cycle Time is 1/3G seconds).
3. **IPC (Instructions Per Cycle)**
IPC is the inverse of CPI. It measures how many instructions are executed in **one CPU cycle on average**.
$$\text{IPC} = \frac{1}{\text{CPI}}$$
* **IPC > 1**: Represents a "Superscalar" architecture that can complete more than one instruction per cycle on average.
* **IPC < 1**: Represents an architecture where one instruction takes more than one cycle on average.
**```getcycles.S```**:
- Provides the get_cycles function.
- Used to read the 64-bit CPU cycle counter (cycleh and cycle).
- It reads the high bits, then the low bits, then the high bits again. If the two high-bit reads do not match, it retries.
```asm=
.text
.globl get_cycles
.align 2
get_cycles:
csrr a1, cycleh
csrr a0, cycle
csrr a2, cycleh
bne a1, a2, get_cycles
ret
.size get_cycles,.-get_cycles
```
**```getinstret.S```**:
- Provides the get_instret function.
- Used to read the 64-bit instruction-retired counter (instreth and instret).
- It uses the exact same "high-low-high" retry logic as getcycles.S.
```asm=
.text
.globl get_instret
.align 2
get_instret:
csrr a1, instreth
csrr a0, instret
csrr a2, instreth
bne a1, a2, get_instret
ret
.size get_instret,.-get_instret
```
:::info
By default, rv32emu is a "Functional Simulator," meaning its goal is correctness (ensuring every instruction executes correctly) rather than "Timing Simulation" (simulating the precise time each instruction takes).
As we can see, in ```src/emulate.c```, both ```CSR_CYCLE``` and ```CSR_INSTRET``` points to ```rv->csr_cycle```. So ==**CPI will always be 1**==.
```c
// rv32emu/src/emulate.c
case CSR_CYCLE: /* Cycle counter for RDCYCLE instruction */
return (uint32_t *) &rv->csr_cycle;
case CSR_CYCLEH: /* Upper 32 bits of cycle */
return &((uint32_t *) &rv->csr_cycle)[1];
...
case CSR_INSTRET: /* Number of Instructions Retired Counter */
return (uint32_t *) (&rv->csr_cycle);
case CSR_INSTRETH: /* Upper 32 bits of instructions retired */
return &((uint32_t *) &rv->csr_cycle)[1];
```
:::
## HW1 Problem B
The code is adapted from the previous assignment [(arch2025-homework1-Problem B)](https://hackmd.io/0mzMhln2To2AGMOu9rz3Pg?both#Problem-B).
### uf8
`uf8` is a specialized 8-bit unsigned numerical representation designed for data compression. This scheme enables the storage of a dynamic range far exceeding that of a standard `uint8_t` (0-255) within a single byte (8 bits). Its core principle involves a non-linear quantization strategy that allocates the limited bits between an exponent and a mantissa, achieving a balance between numerical dynamic range and resolution precision.
* **Bit Layout**
```
┌──────────────┬──────────────┐
│ Exponent (4) │ Mantissa (4) │
└──────────────┴──────────────┘
E: Exponent bits (4 bits)
M: Mantissa/fraction bits (4 bits)
```
* **Decoding**
$D(b) = m \cdot 2^e + (2^e - 1) \cdot 16$
where $e = \lfloor b/16 \rfloor$ and $m = b \bmod 16$
* **Encoding**
$E(v) = \begin{cases}
v, & \text{if } v < 16 \\
16e + \lfloor(v - \text{offset}(e))/2^e\rfloor, & \text{otherwise}
\end{cases}$
Where $\text{offset}(e) = (2^e - 1) \cdot 16$
### `clz` optimization
It is a processor instruction that counts the number of consecutive zero bits in a binary number, starting from the most significant bit (the left side) until the first 1 is found.
Its main purpose is to quickly determine a number's magnitude or to normalize it for floating-point operations.
The clz function was optimized using loop unrolling.
The original iterative loop was replaced with a linear sequence of instructions that explicitly performs each step. This improves performance by eliminating loop control overhead and branch instructions, at the cost of a slight increase in code size.
* **Original**
```assembly=
clz:
# Input: a0 = 32-bit unsigned integer.
# Output: a0 = number of leading zeros in x's binary representation
li t0, 32 # n = t0 = 32
li t1, 16 # c = t1 = 16
clz.loop:
srl t2, a0, t1 # y = t2 = x >> c
beq t2, zero, clz.skip # if (y == 0) goto clz.skip
sub t0, t0, t1 # n -= c
mv a0, t2 # x = y
clz.skip:
srli t1, t1, 1
bne t1, zero, clz.loop # while (c != 0) goto clz.loop
sub a0, t0, a0 # return n - x
ret # End of clz function
```
* **Unroll loop**
```assembly=
clz:
# Input: a0 = 32-bit unsigned integer.
# Output: a0 = number of leading zeros in x's binary representation
li t0, 32 # n = t0 = 32
srli t2, a0, 16 # y = t2 = x >> 16
beq t2, zero, clz.L_c8 # if (y == 0) goto clz.L_c8
addi t0, t0, -16 # n -= 16
mv a0, t2 # x = y
clz.L_c8:
srli t2, a0, 8 # y = t2 = x >> 8
beq t2, zero, clz.L_c4 # if (y == 0) goto clz.L_c4
addi t0, t0, -8 # n -= 8
mv a0, t2 # x = y
clz.L_c4:
srli t2, a0, 4 # y = t2 = x >> 4
beq t2, zero, clz.L_c2 # if (y == 0) goto clz.L_c2
addi t0, t0, -4 # n -= 4
mv a0, t2 # x = y
clz.L_c2:
srli t2, a0, 2 # y = t2 = x >> 2
beq t2, zero, clz.L_c1 # if (y == 0) goto .L_c1
addi t0, t0, -2 # n -= 2
mv a0, t2 # x = y
clz.L_c1:
srli t2, a0, 1 # y = t2 = x >> 1
beq t2, zero, clz.L_final # if (y == 0) goto clz.L_final
addi t0, t0, -1 # n -= 1
mv a0, t2 # x = y
clz.L_final:
sub a0, t0, a0 # return n - x
ret
```
### Code
:::spoiler **Modification** (Click to unfold)
```diff=
@@ -13,6 +13,7 @@
.text
+.globl main
# ======================================
# Function: main
# ======================================
@@ -25,13 +26,14 @@
lw ra, 0(sp) # Restore return address
addi sp, sp, 4 # Deallocate stack space
beq a0, zero, main.end # if (a0 == 0) goto main.end
- la a0, str5 # Load address of str5
- li a7, 4 # syscall: print string
+ li a7, 0x40 # syscall: write
+ li a0, 1 # stdout
+ la a1, str5 # Load address of str5
+ li a2, 18 # length = 18
ecall
main.end:
- li a7, 10 # exit code = 10
- ecall
+ ret
# ======================================
# Function: clz
@@ -137,70 +139,63 @@
test:
# Input: void
# Output: a0 = boolean (1 = pass, 0 = fail)
- li t0, -1 # previous_value = -1
- li t1, 1 # t1 = passed = 1
- li t2, 0 # i = 0
- li t3, 256 # max = 256
+ addi sp, sp, -20 # Allocate stack space
+ sw ra, 0(sp) # Save return address
+ sw s0, 4(sp) # Save previous_value
+ sw s1, 8(sp) # Save passed
+ sw s2, 12(sp) # Save i
+ sw s3, 16(sp) # Save max
+ li s0, -1 # previous_value = -1
+ li s1, 1 # s1 = passed = 1
+ li s2, 0 # s2 = i = 0
+ li s3, 256 # s3 = max = 256
test.loop:
- bge t2, t3, test.end # if (i >= max) goto end
- mv t4, t2 # fl = t4 = i
- mv a0, t4 # a0 = fl
- addi sp, sp, -24 # Allocate stack space
- sw ra, 0(sp) # Save return address
- sw t0, 4(sp) # Save previous_value
- sw t1, 8(sp) # Save passed
- sw t2, 12(sp) # Save i
- sw t3, 16(sp) # Save max
- sw t4, 20(sp) # Save fl
+ bge s2, s3, test.end # if (i >= max) goto end
+ mv a0, s2 # a0 = fl
jal uf8_decode # a0 = uf8_decode(fl)
mv t5, a0 # value = t5 = uf8_decode(fl)
jal uf8_encode # a0 = uf8_encode(value)
mv t6, a0 # fl2 = t6 = uf8_encode(value)
- lw t4, 20(sp) # Restore fl
- lw t3, 16(sp) # Restore max
- lw t2, 12(sp) # Restore i
- lw t1, 8(sp) # Restore passed
- lw t0, 4(sp) # Restore previous_value
- lw ra, 0(sp) # Restore return address
- addi sp, sp, 24 # Deallocate stack space
+ mv t4, s2 # fl = t4 = i
beq t4, t6, test.skip1 # if (fl == fl2) goto skip1
mv a0, t4 # a0 = fl
- li a7, 34 # syscall: print integer
- ecall
- la a0, str1 # Load address of str1
- li a7, 4 # syscall: print string
+ jal print_dec # print fl
+ li a7, 0x40 # syscall: write
+ li a0, 1 # stdout
+ la a1, str1 # Load address of str1
+ li a2, 17 # length = 17
ecall
mv a0, t5 # a0 = value
- li a7, 1 # syscall: print integer
- ecall
- la a0, str2 # Load address of str2
- li a7, 4 # syscall: print string
+ jal print_dec # print value
+ li a7, 0x40 # syscall: write
+ li a0, 1 # stdout
+ la a1, str2 # Load address of str2
+ li a2, 21 # length = 21
ecall
mv a0, t6 # a0 = fl2
- li a7, 34 # syscall: print integer
- ecall
- la a0, str6 # Load address of str6
- li a7, 4 # syscall: print string
+ jal print_dec # print fl2
+ li a7, 0x40 # syscall: write
+ li a0, 1 # stdout
+ la a1, str6 # Load address of str6
+ li a2, 1 # length = 1
ecall
- li t1, 0 # passed = 0
+ li s1, 0 # passed = 0
test.skip1:
- blt t0, t5, test.skip2 # if (previous_value < value) goto skip2
+ blt s0, t5, test.skip2 # if (previous_value < value) goto skip2
mv a0, t4 # a0 = fl
- li a7, 34 # syscall: print integer
- ecall
- la a0, str3 # Load address of str3
- li a7, 4 # syscall: print string
+ jal print_dec # print fl
+ li a7, 0x40 # syscall: write
+ li a0, 1 # stdout
+ la a1, str3 # Load address of str3
+ li a2, 8 # length = 8
ecall
mv a0, t5 # a0 = value
- li a7, 1 # syscall: print integer
- ecall
- la a0, str4 # Load address of str4
- li a7, 4 # syscall: print string
+ jal print_dec # print value
+ li a7, 0x40 # syscall: write
+ li a0, 1 # stdout
+ la a1, str4 # Load address of str4
+ li a2, 19 # length = 19
ecall
- mv a0, t0 # a0 = previous_value
- li a7, 1 # syscall: print integer
- ecall
- la a0, str6 # Load address of str6
- li a7, 4 # syscall: print string
+ mv a0, s0 # a0 = previous_value
+ jal print_dec # print previous_value
+ li a7, 0x40 # syscall: write
+ li a0, 1 # stdout
+ la a1, str6 # Load address of str6
+ li a2, 1 # length = 1
ecall
- li t1, 0 # passed = 0
+ li s1, 0 # passed = 0
+ mv a0, s1 # return passed
test.skip2:
- mv t0, t5 # previous_value = value
- addi t2, t2, 1 # i++
+ mv s0, t5 # previous_value = value
+ addi s2, s2, 1 # i++
j test.loop
test.end:
- mv a0, t1 # return passed
+ lw s3, 16(sp) # Restore max
+ lw s2, 12(sp) # Restore i
+ lw s1, 8(sp) # Restore passed
+ lw s0, 4(sp) # Restore previous_value
+ lw ra, 0(sp) # Restore return address
+ addi sp, sp, 20 # Deallocate stack space
ret # End of test function
```
:::
#### Output
```
18:04:40 INFO src/riscv.c:552: Log level: TRACE
18:04:40 INFO src/riscv.c:565: test.elf ELF loaded
18:04:40 INFO src/main.c:315: RISC-V emulator is created and ready to run
All tests passed.
18:04:40 INFO src/main.c:338: RISC-V emulator is destroyed
```
### Analysis
#### Program size
| Section | C code | Original Assembly | Optimized Assembly |
| ------- | ------ | ----------------- | ------------------ |
| text | 3300 | 2204 | 2256 |
| data | 0 | 122 | 122 |
| bss | 4108 | 4106 | 4102 |
| total | 7408 | 6432 | 6480 |
#### Performance
Clock cycles / Instruction count:
| C code | Original Assembly | Optimized Assembly |
| ------ | ----------------- | ------------------ |
| 71326 | 30290 | 27650 |
## Quiz2 Problem A
### Hanoi Tower
The Tower of Hanoi is a famous mathematical game and a classic example of recursive algorithms. It consists of three rods and several disks of different sizes. The objective of the game is to move all the disks from one rod to another, following these rules: only one disk can be moved at a time, and no larger disk may be placed on top of a smaller one.

### Algorithm: Gray Code
- The core of the program is non-recursive, utilizing an iterative solution to the 3-disk Tower of Hanoi problem. This design is critical for assembly implementation as it eliminates the overhead of recursive function calls and prevents potential stack overflow. The program runs a game_loop from 1 to 7. In each iteration, it uses a Gray Code algorithm to determine which disk to move. By relying on Gray Codes, complex recursive logic transforms into fast bitwise operations:
1. Calculate Current Gray(n): xor x6, x8, x5 (where n is in x8).
2. Calculate Previous Gray(n-1): xor x7, x7, x28.
3. Find Changed Bit: xor x5, x6, x7. The single set bit (value 1) in x5 indicates which disk number is to be moved.
4. Determine Disk: The code then uses andi and bne to check the 0th, 1st, and 2nd bits of x5 to set x9 to the correct disk number (0, 1, or 2).
- Calculating the Target Peg
- Smallest Disk (Disk 0): Always moves according to the pattern (current_position + 2) % 3.
- Larger Disks (Disk 1 or 2): Use the "sum of pegs" technique (0+1+2=3). The target peg B is calculated as 3 - source_peg A - other_disk_peg C.
### Code
#### RV32I Assembly
:::spoiler **Assembly code** (Click to unfold)
```asm=
.text
.globl main
main:
addi x2, x2, -36
sw x8, 0(x2)
sw x9, 4(x2)
sw x18, 8(x2)
sw x19, 12(x2)
sw x20, 16(x2)
li x5, 0x15
sw x5, 20(x2)
sw x5, 24(x2)
sw x5, 28(x2)
sw ra, 32(x2)
# Fix disk positions (BLANK 1-3: neutralize x5 effect)
# BLANK 1: Fix position at x2+20
sw x0, 20(x2)
# BLANK 2: Fix position at x2+24
sw x0, 24(x2)
# BLANK 3: Fix position at x2+28
sw x0, 28(x2)
addi x8, x0, 1
game_loop:
# BLANK 4: Check loop termination (2^3 moves)
addi x5, x0, 8
beq x8, x5, finish_game
# Gray code formula: gray(n) = n XOR (n >> k)
# BLANK 5: What is k for Gray code?
srli x5, x8, 1
# BLANK 6: Complete Gray(n) calculation
xor x6, x8, x5
# BLANK 7-8: Calculate previous value and its shift
addi x7, x8, -1
srli x28, x7, 1
# BLANK 9: Generate Gray(n-1)
xor x7, x7, x28
# BLANK 10: Which bits changed?
xor x5, x6, x7
# Initialize disk number
addi x9, x0, 0
# BLANK 11: Mask for testing LSB
andi x6, x5, 1
# BLANK 12: Branch if disk 0 moves
bne x6, x0, disk_found
# BLANK 13: Set disk 1
addi x9, x0, 1
# BLANK 14: Test second bit with proper mask
andi x6, x5, 2
bne x6, x0, disk_found
# BLANK 15: Last disk number
addi x9, x0, 2
disk_found:
# BLANK 16: Check impossible pattern (multiple bits)
andi x30, x5, 5
addi x31, x0, 5
beq x30, x31, pattern_match
jal x0, continue_move
pattern_match:
continue_move:
# BLANK 17: Word-align disk index (multiply by what?)
slli x5, x9, 2
# BLANK 18: Base offset for disk array
addi x5, x5, 20
add x5, x2, x5
lw x18, 0(x5)
bne x9, x0, handle_large
# BLANK 19: Small disk moves by how many positions?
addi x19, x18, 2
# BLANK 20: Number of pegs
addi x6, x0, 3
blt x19, x6, display_move
sub x19, x19, x6
jal x0, display_move
handle_large:
# BLANK 21: Load reference disk position
lw x6, 20(x2)
# BLANK 22: Sum of all peg indices (0+1+2)
addi x19, x0, 3
sub x19, x19, x18
sub x19, x19, x6
display_move:
la x20, obdata
add x5, x20, x18
lbu x11, 0(x5)
li x6, 0x6F
xor x11, x11, x6
addi x11, x11, -0x12
add x7, x20, x19
lbu x12, 0(x7)
xor x12, x12, x6
addi x12, x12, -0x12
mv t1, x11
mv t2, x12
li a7, 0x40
li a0, 1
la a1, str1
li a2, 10
ecall
addi x10, x9, 1
jal print_dec
li a0, 1
la a1, str2
li a2, 6
ecall
la t0, char_buffer
sb t1, 0(t0)
li a0, 1
mv a1, t0
li a2, 1
ecall
li a0, 1
la a1, str3
li a2, 4
ecall
la t0, char_buffer
sb t2, 0(t0)
li a0, 1
mv a1, t0
li a2, 1
ecall
li a0, 1
la a1, str4
li a2, 1
ecall
# BLANK 26: Calculate storage offset
slli x5, x9, 2
addi x5, x5, 20
add x5, x2, x5
# BLANK 27: Update disk position
sw x19, 0(x5)
# BLANK 28-29: Increment counter and loop
addi x8, x8, 1
jal x0, game_loop
finish_game:
lw x8, 0(x2)
lw x9, 4(x2)
lw x18, 8(x2)
lw x19, 12(x2)
lw x20, 16(x2)
lw ra, 32(x2)
addi x2, x2, 36
ret
.data
obdata: .byte 0x3c, 0x3b, 0x3a
str1: .asciz "Move Disk "
str2: .asciz " from "
str3: .asciz " to "
str4: .asciz "\n"
char_buffer: .space 1
```
:::
### Optimization
```Problem_A.S (Original)``` contains a game_loop: structure that iterates 7 times (for counter x8 from 1 to 7). It uses a beq to check for termination (x8 == 8) and a jal to jump back to the start.
```Problem_A_optim.S (Optimized)``` completely removes this loop. It implements loop unrolling by pasting the entire loop body 7 times in a row, allowing the code to execute linearly without any branching or jump instructions.
:::spoiler **Modification** (Click to unfold)
```diff=
--- a/Problem_A.S
+++ b/Problem_A_optim.S
@@ -20,31 +20,13 @@
sw ra, 32(x2)
# Fix disk positions (BLANK 1-3: neutralize x5 effect)
- # BLANK 1: Fix position at x2+20
sw x0, 20(x2)
- # BLANK 2: Fix position at x2+24
sw x0, 24(x2)
- # BLANK 3: Fix position at x2+28
sw x0, 28(x2)
addi x8, x0, 1
-game_loop:
- # BLANK 4: Check loop termination (2^3 moves)
- addi x5, x0, 8
- beq x8, x5, finish_game
-
- # Gray code formula: gray(n) = n XOR (n >> k)
- # BLANK 5: What is k for Gray code?
srli x5, x8, 1
- # BLANK 6: Complete Gray(n) calculation
xor x6, x8, x5
- # BLANK 7-8: Calculate previous value and its shift
addi x7, x8, -1
srli x28, x7, 1
- # BLANK 9: Generate Gray(n-1)
xor x7, x7, x28
- # BLANK 10: Which bits changed?
xor x5, x6, x7
# Initialize disk number
addi x9, x0, 0
- # BLANK 11: Mask for testing LSB
andi x6, x5, 1
- # BLANK 12: Branch if disk 0 moves
- bne x6, x0, disk_found
-
- # BLANK 13: Set disk 1
+ bne x6, x0, disk_found_1
+
addi x9, x0, 1
- # BLANK 14: Test second bit with proper mask
andi x6, x5, 2
- bne x6, x0, disk_found
-
- # BLANK 15: Last disk number
+ bne x6, x0, disk_found_1
+
addi x9, x0, 2
-disk_found:
- # BLANK 16: Check impossible pattern (multiple bits)
+disk_found_1:
andi x30, x5, 5
addi x31, x0, 5
- beq x30, x31, pattern_match
- jal x0, continue_move
-pattern_match:
-continue_move:
-
- # BLANK 17: Word-align disk index (multiply by what?)
+ beq x30, x31, pattern_match_1
+ jal x0, continue_move_1
+pattern_match_1:
+continue_move_1:
+
slli x5, x9, 2
- # BLANK 18: Base offset for disk array
- addi x5, x5, 20
- add x5, x2, x5
- lw x18, 0(x5)
-
- bne x9, x0, handle_large
-
- # BLANK 19: Small disk moves by how many positions?
- addi x19, x18, 2
-
- # BLANK 20: Number of pegs
- addi x6, x0, 3
- blt x19, x6, display_move
+ addi x5, x5, 20
+ add x5, x2, x5
+ lw x18, 0(x5)
+
+ bne x9, x0, handle_large_1
+
+ addi x19, x18, 2
+
+ addi x6, x0, 3
+ blt x19, x6, display_move_1
sub x19, x19, x6
- jal x0, display_move
-
-handle_large:
- # BLANK 21: Load reference disk position
+ jal x0, display_move_1
+
+handle_large_1:
lw x6, 20(x2)
- # BLANK 22: Sum of all peg indices (0+1+2)
addi x19, x0, 3
sub x19, x19, x18
sub x19, x19, x6
-display_move:
+display_move_1:
la x20, obdata
add x5, x20, x18
lbu x11, 0(x5)
@@ -141,20 +123,803 @@
li a2, 1
ecall
- # BLANK 26: Calculate storage offset
- slli x5, x9, 2
- addi x5, x5, 20
- add x5, x2, x5
-
- # BLANK 27: Update disk position
+ slli x5, x9, 2
+ addi x5, x5, 20
+ add x5, x2, x5
+
sw x19, 0(x5)
- # BLANK 28-29: Increment counter and loop
addi x8, x8, 1
- jal x0, game_loop
-
-finish_game:
+
+ srli x5, x8, 1
+
+ xor x6, x8, x5
+
+ addi x7, x8, -1
+ srli x28, x7, 1
+
+ xor x7, x7, x28
+
+ xor x5, x6, x7
+
+ addi x9, x0, 0
+
+ andi x6, x5, 1
+
+ bne x6, x0, disk_found_2
+
+ addi x9, x0, 1
+
+ andi x6, x5, 2
+ bne x6, x0, disk_found_2
+
+ addi x9, x0, 2
+
+disk_found_2:
+ andi x30, x5, 5
+ addi x31, x0, 5
+ beq x30, x31, pattern_match_2
+ jal x0, continue_move_2
+pattern_match_2:
+continue_move_2:
+
+ slli x5, x9, 2
+
+ addi x5, x5, 20
+ add x5, x2, x5
+ lw x18, 0(x5)
+
+ bne x9, x0, handle_large_2
+
+ addi x19, x18, 2
+
+ addi x6, x0, 3
+ blt x19, x6, display_move_2
+ sub x19, x19, x6
+ jal x0, display_move_2
+
+handle_large_2:
+ lw x6, 20(x2)
+
+ addi x19, x0, 3
+ sub x19, x19, x18
+ sub x19, x19, x6
+
+display_move_2:
+ la x20, obdata
+ add x5, x20, x18
+ lbu x11, 0(x5)
+ li x6, 0x6F
+ xor x11, x11, x6
+ addi x11, x11, -0x12
+
+ add x7, x20, x19
+ lbu x12, 0(x7)
+ xor x12, x12, x6
+ addi x12, x12, -0x12
+
+ mv t1, x11
+ mv t2, x12
+ li a7, 0x40
+ li a0, 1
+ la a1, str1
+ li a2, 10
+ ecall
+
+ addi x10, x9, 1
+ jal print_dec
+
+ li a0, 1
+ la a1, str2
+ li a2, 6
+ ecall
+
+ la t0, char_buffer
+ sb t1, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str3
+ li a2, 4
+ ecall
+
+ la t0, char_buffer
+ sb t2, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str4
+ li a2, 1
+ ecall
+
+ slli x5, x9, 2
+ addi x5, x5, 20
+ add x5, x2, x5
+
+ sw x19, 0(x5)
+
+ addi x8, x8, 1
+
+ srli x5, x8, 1
+
+ xor x6, x8, x5
+
+ addi x7, x8, -1
+ srli x28, x7, 1
+
+ xor x7, x7, x28
+
+ xor x5, x6, x7
+
+ addi x9, x0, 0
+
+ andi x6, x5, 1
+
+ bne x6, x0, disk_found_3
+
+ addi x9, x0, 1
+
+ andi x6, x5, 2
+ bne x6, x0, disk_found_3
+
+ addi x9, x0, 2
+
+disk_found_3:
+ andi x30, x5, 5
+ addi x31, x0, 5
+ beq x30, x31, pattern_match_3
+ jal x0, continue_move_3
+pattern_match_3:
+continue_move_3:
+
+ slli x5, x9, 2
+
+ addi x5, x5, 20
+ add x5, x2, x5
+ lw x18, 0(x5)
+
+ bne x9, x0, handle_large_3
+
+ addi x19, x18, 2
+
+ addi x6, x0, 3
+ blt x19, x6, display_move_3
+ sub x19, x19, x6
+ jal x0, display_move_3
+
+handle_large_3:
+ lw x6, 20(x2)
+
+ addi x19, x0, 3
+ sub x19, x19, x18
+ sub x19, x19, x6
+
+display_move_3:
+ la x20, obdata
+ add x5, x20, x18
+ lbu x11, 0(x5)
+ li x6, 0x6F
+ xor x11, x11, x6
+ addi x11, x11, -0x12
+
+ add x7, x20, x19
+ lbu x12, 0(x7)
+ xor x12, x12, x6
+ addi x12, x12, -0x12
+
+ mv t1, x11
+ mv t2, x12
+ li a7, 0x40
+ li a0, 1
+ la a1, str1
+ li a2, 10
+ ecall
+
+ addi x10, x9, 1
+ jal print_dec
+
+ li a0, 1
+ la a1, str2
+ li a2, 6
+ ecall
+
+ la t0, char_buffer
+ sb t1, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str3
+ li a2, 4
+ ecall
+
+ la t0, char_buffer
+ sb t2, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str4
+ li a2, 1
+ ecall
+
+ slli x5, x9, 2
+ addi x5, x5, 20
+ add x5, x2, x5
+
+ sw x19, 0(x5)
+
+ addi x8, x8, 1
+
+ srli x5, x8, 1
+
+ xor x6, x8, x5
+
+ addi x7, x8, -1
+ srli x28, x7, 1
+
+ xor x7, x7, x28
+
+ xor x5, x6, x7
+
+ addi x9, x0, 0
+
+ andi x6, x5, 1
+
+ bne x6, x0, disk_found_4
+
+ addi x9, x0, 1
+
+ andi x6, x5, 2
+ bne x6, x0, disk_found_4
+
+ addi x9, x0, 2
+
+disk_found_4:
+ andi x30, x5, 5
+ addi x31, x0, 5
+ beq x30, x31, pattern_match_4
+ jal x0, continue_move_4
+pattern_match_4:
+continue_move_4:
+
+ slli x5, x9, 2
+
+ addi x5, x5, 20
+ add x5, x2, x5
+ lw x18, 0(x5)
+
+ bne x9, x0, handle_large_4
+
+ addi x19, x18, 2
+
+ addi x6, x0, 3
+ blt x19, x6, display_move_4
+ sub x19, x19, x6
+ jal x0, display_move_4
+
+handle_large_4:
+ lw x6, 20(x2)
+
+ addi x19, x0, 3
+ sub x19, x19, x18
+ sub x19, x19, x6
+
+display_move_4:
+ la x20, obdata
+ add x5, x20, x18
+ lbu x11, 0(x5)
+ li x6, 0x6F
+ xor x11, x11, x6
+ addi x11, x11, -0x12
+
+ add x7, x20, x19
+ lbu x12, 0(x7)
+ xor x12, x12, x6
+ addi x12, x12, -0x12
+
+ mv t1, x11
+ mv t2, x12
+ li a7, 0x40
+ li a0, 1
+ la a1, str1
+ li a2, 10
+ ecall
+
+ addi x10, x9, 1
+ jal print_dec
+
+ li a0, 1
+ la a1, str2
+ li a2, 6
+ ecall
+
+ la t0, char_buffer
+ sb t1, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str3
+ li a2, 4
+ ecall
+
+ la t0, char_buffer
+ sb t2, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str4
+ li a2, 1
+ ecall
+
+ slli x5, x9, 2
+ addi x5, x5, 20
+ add x5, x2, x5
+
+ sw x19, 0(x5)
+
+ addi x8, x8, 1
+
+ srli x5, x8, 1
+
+ xor x6, x8, x5
+
+ addi x7, x8, -1
+ srli x28, x7, 1
+
+ xor x7, x7, x28
+
+ xor x5, x6, x7
+
+ addi x9, x0, 0
+
+ andi x6, x5, 1
+
+ bne x6, x0, disk_found_5
+
+ addi x9, x0, 1
+
+ andi x6, x5, 2
+ bne x6, x0, disk_found_5
+
+ addi x9, x0, 2
+
+disk_found_5:
+ andi x30, x5, 5
+ addi x31, x0, 5
+ beq x30, x31, pattern_match_5
+ jal x0, continue_move_5
+pattern_match_5:
+continue_move_5:
+
+ slli x5, x9, 2
+
+ addi x5, x5, 20
+ add x5, x2, x5
+ lw x18, 0(x5)
+
+ bne x9, x0, handle_large_5
+
+ addi x19, x18, 2
+
+ addi x6, x0, 3
+ blt x19, x6, display_move_5
+ sub x19, x19, x6
+ jal x0, display_move_5
+
+handle_large_5:
+ lw x6, 20(x2)
+
+ addi x19, x0, 3
+ sub x19, x19, x18
+ sub x19, x19, x6
+
+display_move_5:
+ la x20, obdata
+ add x5, x20, x18
+ lbu x11, 0(x5)
+ li x6, 0x6F
+ xor x11, x11, x6
+ addi x11, x11, -0x12
+
+ add x7, x20, x19
+ lbu x12, 0(x7)
+ xor x12, x12, x6
+ addi x12, x12, -0x12
+
+ mv t1, x11
+ mv t2, x12
+ li a7, 0x40
+ li a0, 1
+ la a1, str1
+ li a2, 10
+ ecall
+
+ addi x10, x9, 1
+ jal print_dec
+
+ li a0, 1
+ la a1, str2
+ li a2, 6
+ ecall
+
+ la t0, char_buffer
+ sb t1, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str3
+ li a2, 4
+ ecall
+
+ la t0, char_buffer
+ sb t2, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str4
+ li a2, 1
+ ecall
+
+ slli x5, x9, 2
+ addi x5, x5, 20
+ add x5, x2, x5
+
+ sw x19, 0(x5)
+
+ addi x8, x8, 1
+
+ srli x5, x8, 1
+
+ xor x6, x8, x5
+
+ addi x7, x8, -1
+ srli x28, x7, 1
+
+ xor x7, x7, x28
+
+ xor x5, x6, x7
+
+ addi x9, x0, 0
+
+ andi x6, x5, 1
+
+ bne x6, x0, disk_found_6
+
+ addi x9, x0, 1
+
+ andi x6, x5, 2
+ bne x6, x0, disk_found_6
+
+ addi x9, x0, 2
+
+disk_found_6:
+ andi x30, x5, 5
+ addi x31, x0, 5
+ beq x30, x31, pattern_match_6
+ jal x0, continue_move_6
+pattern_match_6:
+continue_move_6:
+
+ slli x5, x9, 2
+
+ addi x5, x5, 20
+ add x5, x2, x5
+ lw x18, 0(x5)
+
+ bne x9, x0, handle_large_6
+
+ addi x19, x18, 2
+
+ addi x6, x0, 3
+ blt x19, x6, display_move_6
+ sub x19, x19, x6
+ jal x0, display_move_6
+
+handle_large_6:
+ lw x6, 20(x2)
+
+ addi x19, x0, 3
+ sub x19, x19, x18
+ sub x19, x19, x6
+
+display_move_6:
+ la x20, obdata
+ add x5, x20, x18
+ lbu x11, 0(x5)
+ li x6, 0x6F
+ xor x11, x11, x6
+ addi x11, x11, -0x12
+
+ add x7, x20, x19
+ lbu x12, 0(x7)
+ xor x12, x12, x6
+ addi x12, x12, -0x12
+
+ mv t1, x11
+ mv t2, x12
+ li a7, 0x40
+ li a0, 1
+ la a1, str1
+ li a2, 10
+ ecall
+
+ addi x10, x9, 1
+ jal print_dec
+
+ li a0, 1
+ la a1, str2
+ li a2, 6
+ ecall
+
+ la t0, char_buffer
+ sb t1, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str3
+ li a2, 4
+ ecall
+
+ la t0, char_buffer
+ sb t2, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str4
+ li a2, 1
+ ecall
+
+ slli x5, x9, 2
+ addi x5, x5, 20
+ add x5, x2, x5
+
+ sw x19, 0(x5)
+
+ addi x8, x8, 1
+
+ srli x5, x8, 1
+
+ xor x6, x8, x5
+
+ addi x7, x8, -1
+ srli x28, x7, 1
+
+ xor x7, x7, x28
+
+ xor x5, x6, x7
+
+ addi x9, x0, 0
+
+ andi x6, x5, 1
+
+ bne x6, x0, disk_found_7
+
+ addi x9, x0, 1
+
+ andi x6, x5, 2
+ bne x6, x0, disk_found_7
+
+ addi x9, x0, 2
+
+disk_found_7:
+ andi x30, x5, 5
+ addi x31, x0, 5
+ beq x30, x31, pattern_match_7
+ jal x0, continue_move_7
+pattern_match_7:
+continue_move_7:
+
+ slli x5, x9, 2
+
+ addi x5, x5, 20
+ add x5, x2, x5
+ lw x18, 0(x5)
+
+ bne x9, x0, handle_large_7
+
+ addi x19, x18, 2
+
+ addi x6, x0, 3
+ blt x19, x6, display_move_7
+ sub x19, x19, x6
+ jal x0, display_move_7
+
+handle_large_7:
+ lw x6, 20(x2)
+
+ addi x19, x0, 3
+ sub x19, x19, x18
+ sub x19, x19, x6
+
+display_move_7:
+ la x20, obdata
+ add x5, x20, x18
+ lbu x11, 0(x5)
+ li x6, 0x6F
+ xor x11, x11, x6
+ addi x11, x11, -0x12
+
+ add x7, x20, x19
+ lbu x12, 0(x7)
+ xor x12, x12, x6
+ addi x12, x12, -0x12
+
+ mv t1, x11
+ mv t2, x12
+ li a7, 0x40
+ li a0, 1
+ la a1, str1
+ li a2, 10
+ ecall
+
+ addi x10, x9, 1
+ jal print_dec
+
+ li a0, 1
+ la a1, str2
+ li a2, 6
+ ecall
+
+ la t0, char_buffer
+ sb t1, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str3
+ li a2, 4
+ ecall
+
+ la t0, char_buffer
+ sb t2, 0(t0)
+ li a0, 1
+ mv a1, t0
+ li a2, 1
+ ecall
+
+ li a0, 1
+ la a1, str4
+ li a2, 1
+ ecall
+
+ slli x5, x9, 2
+ addi x5, x5, 20
+ add x5, x2, x5
+
+ sw x19, 0(x5)
+
+ addi x8, x8, 1
+
+finish_game:
lw x8, 0(x2)
lw x9, 4(x2)
lw x18, 8(x2)
:::
#### Output:
```
18:10:31 INFO src/riscv.c:552: Log level: TRACE
18:10:31 INFO src/riscv.c:565: test.elf ELF loaded
18:10:31 INFO src/main.c:315: RISC-V emulator is created and ready to run
Move Disk 1 from A to C
Move Disk 2 from A to B
Move Disk 1 from C to B
Move Disk 3 from A to C
Move Disk 1 from B to A
Move Disk 2 from B to C
Move Disk 1 from A to C
18:10:31 INFO src/main.c:338: RISC-V emulator is destroyed
```
### Analysis
#### Program size
| Section | Original Assembly | Optimized Assembly |
| ------- | ----------------- | ------------------ |
| text | 2004 | 4056 |
| data | 61 | 61 |
| bss | 4111 | 4107 |
| total | 6176 | 8224 |
#### Performance
Clock cycle / Instruction count
| Column 1 | Column 2 |
| -------- | -------- |
| 9277 | 9254 |
## Quiz3 Problem C
### Fast Inverse Square Root

The algorithm is based on **Newton's method**, which is used to approximate the solution for $y = 1/\sqrt{x}$.
A key feature of this code is that it **does not rely on a Floating-Point Unit (FPU)**. Instead, it uses 32-bit integer and **fixed-point** arithmetic to perform all calculations.
---
### Operational Flow
1. **Initial Guess**: The `fast_rsqrt` function first calls `clz` (Count Leading Zeros) to estimate the magnitude (exponent) of the input value `x`. It uses this exponent as an index to fetch a rough initial guess from a pre-computed lookup table (`rsqrt_table`).
2. **Interpolation**: To improve the accuracy of the initial guess, the program then performs **linear interpolation** between two adjacent values in the `rsqrt_table` to obtain a more precise starting `y` value.
3. **Newton's Method Iteration**: The program then executes **two** iterations of Newton's method in the `fast_rsqrt.loop`.
* The iteration formula it applies is: $y_{n+1} = y_n \left( \frac{3 - xy_n^2}{2} \right)$
* All operations are simulated using fixed-point. For example, the constant `3` is represented as `3 << 16`.
* Multiplication (`y*y` and `x*y^2`) is accomplished by calling the custom `mul32` function (a 32x32 -> 64 integer multiplication) combined with bit shifts.
* The "divide by 2" in the formula is ultimately implemented via a right shift (`srli ... 17`).
### Code
#### C code
:::spoiler **Problem_C.c** (Click to unfold)
```c
# include <stdint.h>
# include <stdlib.h>
# include <stdbool.h>
# include "print.h"
#define REC_INV_SQRT_CACHE (16)
static const uint32_t inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
~0U, ~0U, 3037000500, 2479700525,
2147483647, 1920767767, 1753413056, 1623345051,
1518500250, 1431655765, 1358187914, 1294981364,
1239850263, 1191209601, 1147878294, 1108955788
};
static const uint16_t rsqrt_table[32] = {
0, 46341, 32768, 23170, 16384, /* 2^0 to 2^4 */
11585, 8192, 5793, 4096, 2896, /* 2^5 to 2^9 */
2048, 1448, 1024, 724, 512, /* 2^10 to 2^14 */
362, 256, 181, 128, 90, /* 2^15 to 2^19 */
64, 45, 32, 23, 16, /* 2^20 to 2^24 */
11, 8, 6, 4, 3, /* 2^25 to 2^29 */
2, 1, /* 2^30 to 2^31 */
};
/*
* Newton iteration: new_y = y * (3/2 - x * y^2 / 2)
* Here, y is a Q0.32 fixed-point number (< 1.0)
*/
static void newton_step(uint32_t *rec_inv_sqrt, uint32_t x){
uint32_t invsqrt, invsqrt2;
uint64_t val;
invsqrt = *rec_inv_sqrt; /* Dereference pointer */
invsqrt2 = ((uint64_t)invsqrt * invsqrt) >> 32;
val = (3LL << 32) - ((uint64_t)x * invsqrt2);
val >>= 2; /* Avoid overflow in following multiply */
val = (val * invsqrt) >> 31; /* Right shift by 31 = (32 - 2 + 1) */
*rec_inv_sqrt = (uint32_t)val;
}
static int clz(uint32_t x){
if(!x)return 32;
int n = 0;
if(!(x & 0xFFFF0000)) {n += 16; x <<= 16;}
if(!(x & 0xFF000000)) {n += 8; x <<= 8;}
if(!(x & 0xF0000000)) {n += 4; x <<= 4;}
if(!(x & 0xC0000000)) {n += 2; x <<= 2;}
if(!(x & 0x80000000)) {n += 1;}
return n;
}
static uint64_t mul32(uint32_t a, uint32_t b){
uint64_t r=0;
for(int i=0; i<32; i++){
if(b & (1U<<i))
r += (uint64_t)a<<i;
}
return r;
}
static uint64_t mul32_split(uint32_t a, uint32_t b){
uint32_t r_lo = 0;
uint32_t r_hi = 0;
for(int i = 0; i < 32; i++){
if(b & (1U << i)){
uint32_t add_lo = a << i;
uint32_t add_hi = (i == 0) ? 0 : (a >> (32 - i));
uint32_t prev_lo = r_lo;
r_lo += add_lo;
if (r_lo < prev_lo) {
r_hi += 1;
}
r_hi += add_hi;
}
}
return ((uint64_t)r_hi << 32) | r_lo;
}
uint32_t fast_rsqrt(uint32_t x){
if(x==0) return 0xffffffff;
if(x==1) return 65536;
int exp = 31 - clz(x);
uint32_t y = rsqrt_table[exp];
if(x > (1U << exp)){
uint32_t y_next = (exp < 31)? rsqrt_table[exp+1]: 0;
uint32_t delta = y - y_next;
uint32_t frac = (uint32_t) ((((uint64_t)x - (1UL << exp)) << 16) >> exp);
y -= (uint32_t) ((delta * frac) >> 16);
}
for (int iter = 0; iter < 2; iter++){
uint32_t y2 = (uint32_t)mul32(y, y);
uint32_t xy2 = (uint32_t)(mul32(x, y2) >> 16);
y = (uint32_t)(mul32(y, (3u << 16) - xy2) >> 17);
}
return y;
}
int main() {
uint32_t result = fast_rsqrt(5);
print_dec(result);
printstr("\n", 1);
return 0;
}
```
:::
:::info
The linking step was changed from $(LD) to $(CC) (which is gcc). This is because C code requires gcc to automatically link the C runtime libraries.
Add -nostartfiles flag. This flag tells gcc not to use its default startup files, and instead, to use the custom start.S in the project as the program's entry point.
```diff
--- a/Makefile
+++ b/Makefile_forC
@@ -8,7 +8,7 @@
AFLAGS = -g $(ARCH)
CFLAGS = -g -march=rv32i_zicsr
-LDFLAGS = -T $(LINKER_SCRIPT)
+LDFLAGS = -T $(LINKER_SCRIPT) -nostartfiles
EXEC = test.elf
CC = $(CROSS_COMPILE)gcc
@@ -16,13 +16,13 @@
LD = $(CROSS_COMPILE)ld
OBJDUMP = $(CROSS_COMPILE)objdump
...
$(EXEC): $(OBJS) $(LINKER_SCRIPT)
- $(LD) $(LDFLAGS) -o $@ $(OBJS)
+ $(CC) $(LDFLAGS) -o $@ $(OBJS)
%.o: %.S
$(AS) $(AFLAGS) $< -o $@
```
:::
#### RV32I Assembly
:::spoiler **rsqrt.S** (Click to unfold)
```asm=
.text
.globl main
main:
addi sp, sp, -4
sw ra, 0(sp)
li a7, 0x40 # syscall: write
li a0, 1 # file descriptor (stdout)
la a1, str1 # Load address of
li a2, 23 # length
ecall
la t0, test_data
lw a0, 0(t0)
jal print_dec
li a0, 1
la a1, str2 # Load address of
li a2, 6 # length
ecall
la a0, test_data
lw a0, 0(a0)
jal fast_rsqrt
jal print_dec
li a0, 1
la a1, str3 # Load address of
li a2, 18 # length
ecall
lw ra, 0(sp)
addi sp, sp, 4
ret
# ============================================================
# function: fast_rsqrt
# ============================================================
newton_step:
# Input: a0 = *rec_inv_sqrt (uint32_t pointer)
# a1 = x (uint32_t)
addi sp, sp, -16
sw ra, 0(sp)
sw s0, 4(sp)
sw s1, 8(sp)
sw s2, 12(sp)
mv s0, a0 # s0 = rec_inv_sqrt
mv s1, a1 # s1 = x
lw s2, 0(s0) # s2 = *rec_inv_sqrt
mv a0, s2 # a0 = invsqrt
mv a1, s2 # a1 = invsqrt
jal ra, mul32 # invsqrt^2, a1 = invsqrt2 = y^2_hi = invsqrt^2 >> 32
mv a0, s1 # a0 = x
jal ra, mul32 # prod = x * invsqrt2
li t0, 3
beqz a0, newton_step.zero_prod_lo
sub a0, zero, a0 # a0 = val_lo = 0 - prod_lo
addi t0, t0, -1 # t0 = 2
newton_step.zero_prod_lo:
sub a1, t0, a1 # a1 = val_hi = 3 - prod_hi
andi t1, a1, 0x3 # t1 = val_hi & 0x3, lower 2 bits
slli t1, t1, 30 # t1 = (val_hi & 0x3) << 30
srli a0, a0, 2 # a0 = val_lo >> 2
or a0, a0, t1 # a0 = (val_hi & 0x3) << 30 | (val_lo >> 2)
srli a1, a1, 2 # a1 = val_hi >> 2
mv s1, a1 # s1 = val_hi
mv a1, s2 # a1 = invsqrt
jal ra, mul32 # new_val_lo = val * invsqrt
mv t0, a0 # a0 = new_val_lo_lo
mv a0, s1 # a0 = val_hi
mv s1, a1 # s1 = new_val_lo_hi
mv a1, s2 # a1 = invsqrt
mv s2, t0 # s2 = new_val_lo_lo
jal ra, mul32 # new_val_hi = invsqrt * val_hi
add a0, s1, a0 # a0 = new_val_lo_hi + new_val_hi_lo
slli a0, a0, 1
srli s2, s2, 31 # new_val_lo_lo >> 31
or a0, a0, s2 # a0 = new_val_hi + (new_val_lo_lo >> 31)
sw a0, 0(s0) # *rec_inv_sqrt = new_val_lo
lw ra, 0(sp)
lw s0, 4(sp)
lw s1, 8(sp)
lw s2, 12(sp)
addi sp, sp, 16
ret
# ============================================================
# function: clz
# ============================================================
clz:
# Input: a0 = x (uint32_t)
# Output: a0 = clz(x) (uint32_t)
bnez a0, clz.not_zero
li a0, 32 # return 32 for x == 0
ret
clz.not_zero:
li t0, 0 # n = 0
lui t1, 0xFFFF0 # mask = 0xFFFF0000
and t2, a0, t1
bnez t2, clz.skip1
addi t0, t0, 16 # n += 16
slli a0, a0, 16
clz.skip1:
lui t1, 0xFF000 # mask = 0xFF000000
and t2, a0, t1
bnez t2, clz.skip2
addi t0, t0, 8 # n += 8
slli a0, a0, 8
clz.skip2:
lui t1, 0xF0000 # mask = 0xF0000000
and t2, a0, t1
bnez t2, clz.skip3
addi t0, t0, 4 # n += 4
slli a0, a0, 4
clz.skip3:
lui t1, 0xC0000 # mask = 0xC0000000
and t2, a0, t1
bnez t2, clz.skip4
addi t0, t0, 2 # n += 2
slli a0, a0, 2
clz.skip4:
lui t1, 0x80000 # mask = 0x80000000
and t2, a0, t1
bnez t2, clz.end
addi t0, t0, 1 # n += 1
clz.end:
mv a0, t0 # return n
ret
# ============================================================
# function: mul32_split
# ============================================================
mul32:
# Input: a0 = a (uint32_t)
# a1 = b (uint32_t)
# Output: a0 = r_lo (low 32), a1 = r_hi (high 32)
li t0, 0 # i = 0
li t1, 32 # constant 32
li t2, 0 # r_lo = 0
li t3, 0 # r_hi = 0
li t4, 1 # bitmask = 1
mul32.loop:
bge t0, t1, mul32.end # if i >= 32, end loop
sll t5, t4, t0 # bitmask = 1 << i
and t5, a1, t5 # check if b's i-th bit
beqz t5, mul32.skip1 # if bit is 0, skip
sll t5, a0, t0 # add_lo = a << i
li t6,0
beqz t0, mul32.skip2 # if i == 0, skip
li t6, 32
sub t6, t6, t0 # 32 - i
srl t6, a0, t6 # add_hi = a >> (32 - i)
mul32.skip2:
add t2, t2, t5 # r_lo += add_lo
add t3, t3, t6 # r_hi += add_hi
bgeu t2, t5, mul32.skip1 # if no overflow, skip carry
addi t3, t3, 1 # if overflow, add carry to high part
mul32.skip1:
addi t0, t0, 1 # i++
j mul32.loop
mul32.end:
mv a0, t2 # return r_lo
mv a1, t3 # return r_hi
ret
# ============================================================
# function: fast_rsqrt
# ============================================================
fast_rsqrt:
# Input: a0 = x (uint32_t)
# Output: a0 = fast_rsqrt(x) (uint32_t)
bnez a0, fast_rsqrt.not_zero
li a0, 0xFFFFFFFF # return 0xFFFFFFFF for x ==
ret
fast_rsqrt.not_zero:
li t0, 1
bne a0, t0, fast_rsqrt.not_one
li a0, 65536 # return 65536 for x == 1
ret
fast_rsqrt.not_one:
addi sp, sp, -20
sw ra, 0(sp)
sw s0, 4(sp)
sw s1, 8(sp)
sw s2, 12(sp)
sw s3, 16(sp)
mv s0, a0 # s0 = x
jal ra, clz
li t0, 31
sub s1, t0, a0 # s1 = exp = 31 - clz(x)
la s2, rsqrt_table
slli t0, s1, 1 # t0 = exp * 2
add t0, t0, s2 # t0 = &rsqrt_table[exp]
lhu s3, 0(t0) # s3 = y
li t0, 1 # t0 = 1
sll t0, t0, s1 # t0 = 1 << exp
bge t0, s0, fast_rsqrt.skip1
li t0, 31
li t1, 0 # t1 = y_next = 0
bge s1, t0, fast_rsqrt.skip1_1
addi t1, s1, 1
slli t1, t1, 1 # t1 = (exp + 1) * 2
add t1, t1, s2 # t1 = &rsqrt_table[exp + 1]
lhu t1, 0(t1) # t1 = rsqrt_table[exp + 1]
fast_rsqrt.skip1_1:
sub t2, s3, t1 # t2 = delta = y - y_next
li t0, 1
sll t0, t0, s1 # t0 = 1 << exp
sub t0, s0, t0 # t0 = x - (1 << exp)
slli t0, t0, 16 # t0 = (x - (1 << exp)) << 16
srl t0, t0, s1 # t0 = frac = ((x - (1 << exp)) << 16)/(1 << exp)
mv a0, t2 # a0 = delta
mv a1, t0 # a1 = frac
jal ra, mul32 # prod = delta * frac
srli t0, a0, 16 # t0 = prod_lo >> 16
slli t1, a1, 16 # t1 = prod_hi << 16
or t0, t1, t0 # a0 = prod_hi << 16 | (prod_lo >> 16)
sub s3, s3, t0 # y -= prod >> 16
fast_rsqrt.skip1:
li s1, 0 # s1 = i = 0
li s2, 2 # s2 = 2
fast_rsqrt.loop:
bge s1, s2, fast_rsqrt.end_loop
mv a0, s3 # a0 = y
mv a1, s3 # a1 = y
jal ra, mul32 # y^2, a0 = y^2_lo, a1 = y^2_hi
or a1, a1, a0 # a1 = y2 = y^2 >> 16
mv a0, s0 # a0 = x
jal ra, mul32 # prod = x * y2
slli a1, a1, 16
srli a0, a0, 16
or a1, a1, a0 # a1 = xy2 = x * y2 >> 16
li t0, 3
slli t0, t0, 16 # t0 = 3 << 16
sub a1, t0, a1
mv a0, s3 # a0 = y
jal ra, mul32 # new_y = y * (3 << 16 - xy2)
srli a0, a0, 17 # new_y_lo >> 17
slli a1, a1, 15 # new_y_hi << 15
or s3, a1, a0 # s3 = new_y_hi << 15 | (new_y_lo >> 17)
addi s1, s1, 1 # i++
j fast_rsqrt.loop
fast_rsqrt.end_loop:
mv a0, s3 # return y
lw s3, 16(sp)
lw s2, 12(sp)
lw s1, 8(sp)
lw s0, 4(sp)
lw ra, 0(sp)
addi sp, sp, 20
ret
# ============================================================
# Data Section
# ============================================================
.data
inv_sqrt_cache: .word 0xFFFFFFFF, 0xFFFFFFFF, 3037000500, 2479700525
.word 2147483647, 1920767767, 1753413056, 1623345051
.word 1518500250, 1431655765, 1358187914, 1294981364
.word 1264197512, 1220703125, 1181116006, 1145324612
rsqrt_table: .half 65535, 46341, 32768, 23170, 16384
.half 11585, 8192, 5793, 4096, 2896
.half 2048, 1448, 1024, 724, 512
.half 362, 256, 181, 128, 90
.half 64, 45, 32, 23, 16
.half 11, 8, 6, 4, 3
.half 2, 1
test_data: .word 4
str1: .asciz "Reverse Square Root of "
str2: .asciz " is: "
str3: .asciz " in fp32 encoding\n"
```
:::
### Optimization
1. ```mul32 Function (32x32 multiplication)```:
In the original version (rsqrt.S), mul32 contained a loop (mul32.loop) that iterated 32 times.
In the optimized version, this loop has been fully unrolled. The author removed all loop-control instructions (bge, j) and sequentially pasted all 32 copies of the loop body (from i = 0 to i = 31).
2. ```fast_rsqrt Function (Newton's Method Iteration)```:
In the original version, fast_rsqrt.loop was a loop that executed 2 times for the Newton's method iteration.
In the optimized version, this loop was also unrolled. The author duplicated the iteration code twice, allowing it to execute as linear, sequential code.
:::spoiler **Modification** (Click to unfold)
```diff=
--- a/rsqrt.S
+++ b/rsqrt_optim.S
@@ -107,32 +107,317 @@
# ============================================================
# function: mul32_split
# ============================================================
mul32:
# Input: a0 = a (uint32_t)
# a1 = b (uint32_t)
# Output: a0 = r_lo (low 32), a1 = r_hi (high 32)
- li t0, 0 # i = 0
- li t1, 32 # constant 32
li t2, 0 # r_lo = 0
li t3, 0 # r_hi = 0
li t4, 1 # bitmask = 1
-mul32.loop:
- bge t0, t1, mul32.end # if i >= 32, end loop
- sll t5, t4, t0 # bitmask = 1 << i
- and t5, a1, t5 # check if b's i-th bit
- beqz t5, mul32.skip1 # if bit is 0, skip
- sll t5, a0, t0 # add_lo = a << i
- li t6,0
- beqz t0, mul32.skip2 # if i == 0, skip
- li t6, 32
- sub t6, t6, t0 # 32 - i
- srl t6, a0, t6 # add_hi = a >> (32 - i)
-mul32.skip2:
- add t2, t2, t5 # r_lo += add_lo
- add t3, t3, t6 # r_hi += add_hi
- bgeu t2, t5, mul32.skip1 # if no overflow, skip carry
- addi t3, t3, 1 # if overflow, add carry to high part
-mul32.skip1:
- addi t0, t0, 1 # i++
- j mul32.loop
+
+# for i = 0
+ slli t5, t4, 0 # bitmask = 1 << 0
+ and t5, a1, t5 # check if b's 0-th bit
+ beqz t5, mul32.skip1_0 # if bit is 0, skip
+ slli t5, a0, 0 # add_lo = a << 0
+ li t6, 0 # add_hi = 0
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_0 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_0:
+
+# for i = 1
+ slli t5, t4, 1 # bitmask = 1 << 1
+ and t5, a1, t5 # check if b's 1-th bit
+ beqz t5, mul32.skip1_1 # if bit is 0, skip
+ slli t5, a0, 1 # add_lo = a << 1
+ li t6, 31 # 32 - 1
+ srl t6, a0, t6 # add_hi = a >> (32 - 1)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_1 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_1:
+
+# for i = 2
+ slli t5, t4, 2 # bitmask = 1 << 2
+ and t5, a1, t5 # check if b's 2-th bit
+ beqz t5, mul32.skip1_2 # if bit is 0, skip
+ slli t5, a0, 2 # add_lo = a << 2
+ li t6, 30 # 32 - 2
+ srl t6, a0, t6 # add_hi = a >> (32 - 2)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_2 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_2:
+
+# for i = 3
+ slli t5, t4, 3 # bitmask = 1 << 3
+ and t5, a1, t5 # check if b's 3-th bit
+ beqz t5, mul32.skip1_3 # if bit is 0, skip
+ slli t5, a0, 3 # add_lo = a << 3
+ li t6, 29 # 32 - 3
+ srl t6, a0, t6 # add_hi = a >> (32 - 3)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_3 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_3:
+
+# for i = 4
+ slli t5, t4, 4 # bitmask = 1 << 4
+ and t5, a1, t5 # check if b's 4-th bit
+ beqz t5, mul32.skip1_4 # if bit is 0, skip
+ slli t5, a0, 4 # add_lo = a << 4
+ li t6, 28 # 32 - 4
+ srl t6, a0, t6 # add_hi = a >> (32 - 4)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_4 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_4:
+
+# for i = 5
+ slli t5, t4, 5 # bitmask = 1 << 5
+ and t5, a1, t5 # check if b's 5-th bit
+ beqz t5, mul32.skip1_5 # if bit is 0, skip
+ slli t5, a0, 5 # add_lo = a << 5
+ li t6, 27 # 32 - 5
+ srl t6, a0, t6 # add_hi = a >> (32 - 5)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_5 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_5:
+
+# for i = 6
+ slli t5, t4, 6 # bitmask = 1 << 6
+ and t5, a1, t5 # check if b's 6-th bit
+ beqz t5, mul32.skip1_6 # if bit is 0, skip
+ slli t5, a0, 6 # add_lo = a << 6
+ li t6, 26 # 32 - 6
+ srl t6, a0, t6 # add_hi = a >> (32 - 6)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_6 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_6:
+
+# for i = 7
+ slli t5, t4, 7 # bitmask = 1 << 7
+ and t5, a1, t5 # check if b's 7-th bit
+ beqz t5, mul32.skip1_7 # if bit is 0, skip
+ slli t5, a0, 7 # add_lo = a << 7
+ li t6, 25 # 32 - 7
+ srl t6, a0, t6 # add_hi = a >> (32 - 7)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_7 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_7:
+
+# for i = 8
+ slli t5, t4, 8 # bitmask = 1 << 8
+ and t5, a1, t5 # check if b's 8-th bit
+ beqz t5, mul32.skip1_8 # if bit is 0, skip
+ slli t5, a0, 8 # add_lo = a << 8
+ li t6, 24 # 32 - 8
+ srl t6, a0, t6 # add_hi = a >> (32 - 8)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_8 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_8:
+
+# for i = 9
+ slli t5, t4, 9 # bitmask = 1 << 9
+ and t5, a1, t5 # check if b's 9-th bit
+ beqz t5, mul32.skip1_9 # if bit is 0, skip
+ slli t5, a0, 9 # add_lo = a << 9
+ li t6, 23 # 32 - 9
+ srl t6, a0, t6 # add_hi = a >> (32 - 9)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_9 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_9:
+
+# for i = 10
+ slli t5, t4, 10 # bitmask = 1 << 10
+ and t5, a1, t5 # check if b's 10-th bit
+ beqz t5, mul32.skip1_10 # if bit is 0, skip
+ slli t5, a0, 10 # add_lo = a << 10
+ li t6, 22 # 32 - 10
+ srl t6, a0, t6 # add_hi = a >> (32 - 10)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_10 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_10:
+
+# for i = 11
+ slli t5, t4, 11 # bitmask = 1 << 11
+ and t5, a1, t5 # check if b's 11-th bit
+ beqz t5, mul32.skip1_11 # if bit is 0, skip
+ slli t5, a0, 11 # add_lo = a << 11
+ li t6, 21 # 32 - 11
+ srl t6, a0, t6 # add_hi = a >> (32 - 11)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_11 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_11:
+
+# for i = 12
+ slli t5, t4, 12 # bitmask = 1 << 12
+ and t5, a1, t5 # check if b's 12-th bit
+ beqz t5, mul32.skip1_12 # if bit is 0, skip
+ slli t5, a0, 12 # add_lo = a << 12
+ li t6, 20 # 32 - 12
+ srl t6, a0, t6 # add_hi = a >> (32 - 12)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_12 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_12:
+
+# for i = 13
+ slli t5, t4, 13 # bitmask = 1 << 13
+ and t5, a1, t5 # check if b's 13-th bit
+ beqz t5, mul32.skip1_13 # if bit is 0, skip
+ slli t5, a0, 13 # add_lo = a << 13
+ li t6, 19 # 32 - 13
+ srl t6, a0, t6 # add_hi = a >> (32 - 13)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_13 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_13:
+
+# for i = 14
+ slli t5, t4, 14 # bitmask = 1 << 14
+ and t5, a1, t5 # check if b's 14-th bit
+ beqz t5, mul32.skip1_14 # if bit is 0, skip
+ slli t5, a0, 14 # add_lo = a << 14
+ li t6, 18 # 32 - 14
+ srl t6, a0, t6 # add_hi = a >> (32 - 14)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_14 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_14:
+
+# for i = 15
+ slli t5, t4, 15 # bitmask = 1 << 15
+ and t5, a1, t5 # check if b's 15-th bit
+ beqz t5, mul32.skip1_15 # if bit is 0, skip
+ slli t5, a0, 15 # add_lo = a << 15
+ li t6, 17 # 32 - 15
+ srl t6, a0, t6 # add_hi = a >> (32 - 15)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_15 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_15:
+
+# for i = 16
+ slli t5, t4, 16 # bitmask = 1 << 16
+ and t5, a1, t5 # check if b's 16-th bit
+ beqz t5, mul32.skip1_16 # if bit is 0, skip
+ slli t5, a0, 16 # add_lo = a << 16
+ li t6, 16 # 32 - 16
+ srl t6, a0, t6 # add_hi = a >> (32 - 16)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_16 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_16:
+
+# for i = 17
+ slli t5, t4, 17 # bitmask = 1 << 17
+ and t5, a1, t5 # check if b's 17-th bit
+ beqz t5, mul32.skip1_17 # if bit is 0, skip
+ slli t5, a0, 17 # add_lo = a << 17
+ li t6, 15 # 32 - 17
+ srl t6, a0, t6 # add_hi = a >> (32 - 17)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_17 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_17:
+
+# for i = 18
+ slli t5, t4, 18 # bitmask = 1 << 18
+ and t5, a1, t5 # check if b's 18-th bit
+ beqz t5, mul32.skip1_18 # if bit is 0, skip
+ slli t5, a0, 18 # add_lo = a << 18
+ li t6, 14 # 32 - 18
+ srl t6, a0, t6 # add_hi = a >> (32 - 18)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_18 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_18:
+
+# for i = 19
+ slli t5, t4, 19 # bitmask = 1 << 19
+ and t5, a1, t5 # check if b's 19-th bit
+ beqz t5, mul32.skip1_19 # if bit is 0, skip
+ slli t5, a0, 19 # add_lo = a << 19
+ li t6, 13 # 32 - 19
+ srl t6, a0, t6 # add_hi = a >> (32 - 19)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_19 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_19:
+
+# for i = 20
+ slli t5, t4, 20 # bitmask = 1 << 20
+ and t5, a1, t5 # check if b's 20-th bit
+ beqz t5, mul32.skip1_20 # if bit is 0, skip
+ slli t5, a0, 20 # add_lo = a << 20
+ li t6, 12 # 32 - 20
+ srl t6, a0, t6 # add_hi = a >> (32 - 20)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_20 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_20:
+
+# for i = 21
+ slli t5, t4, 21 # bitmask = 1 << 21
+ and t5, a1, t5 # check if b's 21-th bit
+ beqz t5, mul32.skip1_21 # if bit is 0, skip
+ slli t5, a0, 21 # add_lo = a << 21
+ li t6, 11 # 32 - 21
+ srl t6, a0, t6 # add_hi = a >> (32 - 21)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_21 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_21:
+
+# for i = 22
+ slli t5, t4, 22 # bitmask = 1 << 22
+ and t5, a1, t5 # check if b's 22-th bit
+ beqz t5, mul32.skip1_22 # if bit is 0, skip
+ slli t5, a0, 22 # add_lo = a << 22
+ li t6, 10 # 32 - 22
+ srl t6, a0, t6 # add_hi = a >> (32 - 22)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_22 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_22:
+
+# for i = 23
+ slli t5, t4, 23 # bitmask = 1 << 23
+ and t5, a1, t5 # check if b's 23-th bit
+ beqz t5, mul32.skip1_23 # if bit is 0, skip
+ slli t5, a0, 23 # add_lo = a << 23
+ li t6, 9 # 32 - 23
+ srl t6, a0, t6 # add_hi = a >> (32 - 23)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_23 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_23:
+
+# for i = 24
+ slli t5, t4, 24 # bitmask = 1 << 24
+ and t5, a1, t5 # check if b's 24-th bit
+ beqz t5, mul32.skip1_24 # if bit is 0, skip
+ slli t5, a0, 24 # add_lo = a << 24
+ li t6, 8 # 32 - 24
+ srl t6, a0, t6 # add_hi = a >> (32 - 24)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_24 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_24:
+
+# for i = 25
+ slli t5, t4, 25 # bitmask = 1 << 25
+ and t5, a1, t5 # check if b's 25-th bit
+ beqz t5, mul32.skip1_25 # if bit is 0, skip
+ slli t5, a0, 25 # add_lo = a << 25
+ li t6, 7 # 32 - 25
+ srl t6, a0, t6 # add_hi = a >> (32 - 25)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_25 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_25:
+
+# for i = 26
+ slli t5, t4, 26 # bitmask = 1 << 26
+ and t5, a1, t5 # check if b's 26-th bit
+ beqz t5, mul32.skip1_26 # if bit is 0, skip
+ slli t5, a0, 26 # add_lo = a << 26
+ li t6, 6 # 32 - 26
+ srl t6, a0, t6 # add_hi = a >> (32 - 26)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_26 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_26:
+
+# for i = 27
+ slli t5, t4, 27 # bitmask = 1 << 27
+ and t5, a1, t5 # check if b's 27-th bit
+ beqz t5, mul32.skip1_27 # if bit is 0, skip
+ slli t5, a0, 27 # add_lo = a << 27
+ li t6, 5 # 32 - 27
+ srl t6, a0, t6 # add_hi = a >> (32 - 27)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_27 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_27:
+
+# for i = 28
+ slli t5, t4, 28 # bitmask = 1 << 28
+ and t5, a1, t5 # check if b's 28-th bit
+ beqz t5, mul32.skip1_28 # if bit is 0, skip
+ slli t5, a0, 28 # add_lo = a << 28
+ li t6, 4 # 32 - 28
+ srl t6, a0, t6 # add_hi = a >> (32 - 28)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_28 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_28:
+
+# for i = 29
+ slli t5, t4, 29 # bitmask = 1 << 29
+ and t5, a1, t5 # check if b's 29-th bit
+ beqz t5, mul32.skip1_29 # if bit is 0, skip
+ slli t5, a0, 29 # add_lo = a << 29
+ li t6, 3 # 32 - 29
+ srl t6, a0, t6 # add_hi = a >> (32 - 29)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_29 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_29:
+
+# for i = 30
+ slli t5, t4, 30 # bitmask = 1 << 30
+ and t5, a1, t5 # check if b's 30-th bit
+ beqz t5, mul32.skip1_30 # if bit is 0, skip
+ slli t5, a0, 30 # add_lo = a << 30
+ li t6, 2 # 32 - 30
+ srl t6, a0, t6 # add_hi = a >> (32 - 30)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_30 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_30:
+
+# for i = 31
+ slli t5, t4, 31 # bitmask = 1 << 31
+ and t5, a1, t5 # check if b's 31-th bit
+ beqz t5, mul32.skip1_31 # if bit is 0, skip
+ slli t5, a0, 31 # add_lo = a << 31
+ li t6, 1 # 32 - 31
+ srl t6, a0, t6 # add_hi = a >> (32 - 31)
+ add t2, t2, t5 # r_lo += add_lo
+ add t3, t3, t6 # r_hi += add_hi
+ bgeu t2, t5, mul32.skip1_31 # if no overflow, skip carry
+ addi t3, t3, 1 # if overflow, add carry to high part
+mul32.skip1_31:
+
mul32.end:
mv a0, t2 # return r_lo
mv a1, t3 # return r_hi
@@ -196,10 +481,7 @@
slli t0, t0, 16 # t0 = (x - (1 << exp)) << 16
srl t0, t0, s1 # t0 = frac = ((x - (1 << exp)) << 16)/(1 << exp)
mv a0, t2 # a0 = delta
- mv a1, t0 # a1 = frac
- jal ra, mul32 # prod = delta * frac
- srli t0, a0, 16 # t0 = prod_lo >> 16
- slli t1, a1, 16 # t1 = prod_hi << 16
+ mv a1, t0 # a1 = frac
+ jal ra, mul32 # prod = delta * frac
+ srli t0, a0, 16 # t0 = prod_lo >> 16
+ slli t1, a1, 16 # t1 = prod_hi << 16
or t0, t1, t0 # a0 = prod_hi << 16 | (prod_lo >> 16)
sub s3, s3, t0 # y -= prod >> 16
fast_rsqrt.skip1:
- li s1, 0 # s1 = i = 0
- li s2, 2 # s2 = 2
-fast_rsqrt.loop:
- bge s1, s2, fast_rsqrt.end_loop
mv a0, s3 # a0 = y
mv a1, s3 # a1 = y
jal ra, mul32 # y^2, a0 = y^2_lo, a1 = y^2_hi
@@ -217,9 +499,22 @@
srli a0, a0, 17 # new_y_lo >> 17
slli a1, a1, 15 # new_y_hi << 15
or s3, a1, a0 # s3 = new_y_hi << 15 | (new_y_lo >> 17)
- addi s1, s1, 1 # i++
- j fast_rsqrt.loop
-fast_rsqrt.end_loop:
+ mv a0, s3 # a0 = y
+ mv a1, s3 # a1 = y
+ jal ra, mul32 # y^2, a0 = y^2_lo, a1 = y^2_hi
+ or a1, a1, a0 # a1 = y2 = y^2 >> 16
+ mv a0, s0 # a0 = x
+ jal ra, mul32 # prod = x * y2
+ slli a1, a1, 16
+ srli a0, a0, 16
+ or a1, a1, a0 # a1 = xy2 = x * y2 >> 16
+ li t0, 3
+ slli t0, t0, 16 # t0 = 3 << 16
+ sub a1, t0, a1
+ mv a0, s3 # a0 = y
+ jal ra, mul32 # new_y = y * (3 << 16 - xy2)
+ srli a0, a0, 17 # new_y_lo >> 17
+ slli a1, a1, 15 # new_y_hi << 15
+ or s3, a1, a0 # s3 = new_y_hi << 15 | (new_y_lo >> 17)
+
+fast_rsqrt.end_loop:
mv a0, s3 # return y
lw s3, 16(sp)
lw s2, 12(sp)
```
:::
#### Output:
```
17:26:19 INFO src/riscv.c:552: Log level: TRACE
17:26:19 INFO src/riscv.c:565: test.elf ELF loaded
17:26:19 INFO src/main.c:315: RISC-V emulator is created and ready to run
Reverse Square Root of 19 is: 15034 in fp32 encoding
17:26:19 INFO src/main.c:338: RISC-V emulator is destroyed
```
### Analysis
#### Program size
| Section | C code | Original Assembly | Optimized Assembly |
| ------- | ------ | ----------------- | ------------------ |
| text | 4125 | 2380 | 3632 |
| data | 0 | 214 | 214 |
| bss | 4099 | 4110 | 4106 |
| total | 8224 | 6704 | 7952 |
#### Performance
- best case(exception): fast_rsqrt(0)
- best case(normal): fast_rsqrt(2)
- worst case: fast_rsqrt(3)
Clock cycles / Instruction count:
| Cases | C code | Original Assembly | Optimized Assembly |
| --------------- | ------ | ----------------- | ------------------ |
| Best(exception) | 64 | 22 | 22 |
| Best(normal) | 3381 | 1564 | 885 |
| Worst | 4561 | 2111 | 1225 |
#### Precision
The script iterates through all integer values of x from 1 to 65535. It compares the "approximate value" (y_approx) derived from the table against the "exact value" (y_exact) computed using math.sqrt(), and then calculates the relative error between them.
- Maximum Relative Error: **0.414192**
- Average Relative Error: **0.218814**

:::spoiler **Code** (Click to unfold)
```python=
import math
import numpy as np
import matplotlib.pyplot as plt
rsqrt_table = np.array([65535, 46341, 32768, 23170, 16384, 11585, 8192, 5793, 4096, 2896,
2048, 1448, 1024, 724, 512, 362, 256, 181, 128, 90,
64, 45, 32, 23, 16, 11, 8, 6, 4, 3, 2, 1], dtype=np.uint16)
def compute_initial_error(x):
if x == 0:
return np.nan
clz = 32 - (int(x).bit_length() if x > 0 else 0)
exp = 31 - clz
if exp < 0 or exp >= 32:
raise ValueError(f"Invalid exp {exp} for x={x}")
y_init = rsqrt_table[exp]
y_approx = y_init / 65536.0
y_exact = 1.0 / math.sqrt(x)
return abs(y_approx - y_exact) / y_exact
x_values = np.arange(1, 0x10000) # 1 to 65535
errors = np.array([compute_initial_error(x) for x in x_values])
max_err = np.nanmax(errors)
min_err = np.nanmin(errors)
avg_err = np.nanmean(errors)
print(f"Maximum Relative Error: {max_err:.6f}")
print(f"Minimum Relative Error: {min_err:.6f}")
print(f"Average Relative Error: {avg_err:.6f}")
plt.figure(figsize=(10, 6))
plt.plot(x_values, errors, label='Relative Error', color='blue', alpha=0.7)
plt.xscale('log')
plt.xlabel('x Value (log scale)')
plt.ylabel('Relative Error')
plt.title('Distribution of Initial Relative Error in Fast Rsqrt Guess (x=1 to 65535)')
plt.grid(True)
plt.legend()
plt.savefig('rsqrt_initial_error_plot.png')
plt.show()
```
:::
## References
* [Lab2: RISC-V Instruction Set Simulator and System Emulator](https://hackmd.io/@sysprog/Sko2Ja5pel#Setup)
* [riscv-none-elf-gcc-xpack](https://github.com/xpack-dev-tools/riscv-none-elf-gcc-xpack)
* [rv32emu](https://github.com/sysprog21/rv32emu)
* [Approximating sine function in BF16 data type](https://hackmd.io/@Max042004/bf16_sin)