# Assignment2: Complete Applications
Contributed by [liangchingyun](https://github.com/liangchingyun)
## HW1 Program Adaptation
> Pick one complete program (with test suite) done in your [Homework 1](https://hackmd.io/@sysprog/2025-arch-homework1) and make it run in a **bare-metal** environment using rv32emu’s system emulation.
> * There are just **RV32I** instructions that can be used. This means that you MUST build C programs with the `-march=rv32izicsr` or `-march=rv32i_zicsr` flags.
> * RV32M (multiply and divide) and RV32F (single-precision floating point) are not permitted.
>* [rv32emu](https://github.com/sysprog21/rv32emu) and [Ripes](https://github.com/mortbopet/Ripes) may not work together, therefore please be aware of the potential incompatibility. Check [docs/syscall.md](https://github.com/sysprog21/rv32emu/blob/master/docs/syscall.md) and [src/syscall.c](https://github.com/sysprog21/rv32emu/blob/master/src/syscall.c) in advance.
>* Do not duplicate workspaces or the entire repository from [rv32emu](https://github.com/sysprog21/rv32emu). As a starting point, copy the `tests/system/playground directory` instead. You shall modify `Makefile` and the linker script accordingly.
“Bare‑metal” refers to computer hardware without any **installed operating system** or other software, where instructions are executed directly on the underlying hardware logic.
Change the following lines in the `Makefile` to local‐path equivalents:
```
include ../../../mk/toolchain.mk
EMU ?= ../../../build/rv32emu
@grep -q "ENABLE_ELF_LOADER=1" ../../../build/.config || (echo "Error: ENABLE_ELF_LOADER=1 not set" && exit 1)
@grep -q "ENABLE_SYSTEM=1" ../../../build/.config || (echo "Error: ENABLE_SYSTEM=1 not set" && exit 1)
```
```
$ make
$ make run
```
### C code
```c
/* Decode uf8 to uint32_t */
uint32_t uf8_decode(uf8 fl)
{
uint32_t mantissa = fl & 0x0f; // low 4 bits
uint8_t exponent = fl >> 4; // high 4 bits
uint32_t offset = (0x7FFF >> (15 - exponent)) << 4; // (2^e - 1)·16
return (mantissa << exponent) + offset;
}
```
```c
/* Encode uint32_t to uf8 */
uf8 uf8_encode(uint32_t value)
{
/* Use CLZ for fast exponent calculation */
if (value < 16) // If less than 16, return directly → no compression needed.
return value;
/* Find appropriate exponent using CLZ hint */
int lz = clz(value); // count leading zeros
int msb = 31 - lz;
/* Start from a good initial guess */
uint8_t exponent = 0;
uint32_t overflow = 0;
if (msb >= 5) {
/* Estimate exponent - the formula is empirical */
exponent = msb - 4; // Initial estimate
if (exponent > 15)
exponent = 15;
/* Calculate overflow for estimated exponent */ // Interval start value
for (uint8_t e = 0; e < exponent; e++)
overflow = (overflow << 1) + 16; // overflow = offset(e) = (2^e−1)⋅16
/* Adjust if estimate was off */
while (exponent > 0 && value < overflow) { // value < overflow → exponent guessed too large.
overflow = (overflow - 16) >> 1;
exponent--;
}
}
/* Find exact exponent */
while (exponent < 15) {
uint32_t next_overflow = (overflow << 1) + 16;
if (value < next_overflow) // value falls within the current exponent’s interval → no need to increase the exponent.
break;
overflow = next_overflow;
exponent++; // exponent too small.
}
uint8_t mantissa = (value - overflow) >> exponent;
return (exponent << 4) | mantissa;
}
```
###
### Original Test Suite
```c
printf("The encoded value of 108 is %d \n", uf8_encode(108));
printf("The encoded value of 816 is %d \n", uf8_encode(816));
printf("The encoded value of 524272 is %d \n", uf8_encode(524272));
printf("The decoded value of 0x2F is %d \n", uf8_decode(0x2F));
printf("The decoded value of 0x5A is %d \n", uf8_decode(0x5A));
printf("The decoded value of 0xF0 is %d \n", uf8_decode(0xF0));
```
```c
static bool test(void)
{
int32_t previous_value = -1;
bool passed = true;
for (int i = 0; i < 256; i++) {
uint8_t fl = i;
int32_t value = uf8_decode(fl);
uint8_t fl2 = uf8_encode(value);
if (fl != fl2) {
printf("%02x: produces value %d but encodes back to %02x\n", fl,
value, fl2);
passed = false;
}
if (value <= previous_value) {
printf("%02x: value %d <= previous_value %d\n", fl, value,
previous_value);
passed = false;
}
previous_value = value;
}
return passed;
}
```
```
The encoded value of 108 is 47
The encoded value of 816 is 90
The encoded value of 524272 is 240
The decoded value of 0x2F is 108
The decoded value of 0x5A is 816
The decoded value of 0xF0 is 524272
All tests passed.
```
### Bare-Metal Test Suite
```c
uf8 test_vals[] = {0x2F, 0x5A, 0xF0};
uint32_t decoded;
uf8 encoded;
for (int i = 0; i < sizeof(test_vals)/sizeof(test_vals[0]); i++) {
TEST_LOGGER("Original uf8: ");
print_dec(test_vals[i]);
decoded = uf8_decode(test_vals[i]);
TEST_LOGGER("Decoded value: ");
print_dec(decoded);
encoded = uf8_encode(decoded);
TEST_LOGGER("Encoded back: ");
print_dec(encoded);
TEST_LOGGER("\n");
}
```
```c
int passed = 1;
for (int i = 0; i < 256; i++) {
uint32_t val = uf8_decode(i);
uf8 fl2 = uf8_encode(val);
if (i != fl2) {
TEST_LOGGER("Mismatch at uf8 = ");
print_dec(i);
TEST_LOGGER("\n");
passed = 0;
}
}
if (passed) {
TEST_LOGGER("All tests passed.");
TEST_LOGGER("\n");
}
```
```
Original uf8: 47
Decoded value: 108
Encoded back: 47
Original uf8: 90
Decoded value: 816
Encoded back: 90
Original uf8: 240
Decoded value: 524272
Encoded back: 240
All tests passed.
```
## Quiz Adaptation
>Adapt Problem A from [Quiz 2](https://hackmd.io/@sysprog/arch2025-quiz2) and Problem C from [Quiz 3](https://hackmd.io/@sysprog/arch2025-quiz3), and make them run in a bare-metal environment using rv32emu’s system emulation. Refer to [Approximating sine function in BF16 data](https://hackmd.io/@Max042004/bf16_sin) type for guidance on analyzing precision and performance. Always measure and refine your implementation based on profiling results.
>Disassemble the ELF files produced by the C compiler and contrast the handwritten and compiler-optimized assembly listings.
>* You can append the compilation options to experiment. e.g., Change `-Ofast` (optimized for speed) to `-Os` (optimized for size).
>* Describe your obserations and explain.
### Problem A from [Quiz 2](https://hackmd.io/@sysprog/arch2025-quiz2)
#### Original code
```riscv
.text
.globl main
main:
addi x2, x2, -32 # stack pointer
sw x8, 0(x2) # x8 = step
sw x9, 4(x2) # x9 = idx of moving disk
sw x18, 8(x2) # x18 = disk 1 location
sw x19, 12(x2) # x19 = disk 2 location
sw x20, 16(x2) # x20 = disk 3 location
li x5, 0x15
sw x5, 20(x2)
sw x5, 24(x2)
sw x5, 28(x2)
# Fix disk positions
sw x0, 20(x2) # Fix position at x2+20
sw x0, 24(x2) # Fix position at x2+24
sw x0, 28(x2) # Fix position at x2+28
addi x8, x0, 1
game_loop:
# Check loop termination (2^3 moves)
addi x5, x0, 8
beq x8, x5, finish_game
# Calculate Gray(n)
srli x5, x8, 1
xor x6, x8, x5 # x6 = Gray(n)
# Calculate Gray(n-1)
addi x7, x8, -1
srli x28, x7, 1
xor x7, x7, x28 # x7 = Gray(n-1)
# Find changed bit
xor x5, x6, x7
# Find moving disk number
addi x9, x0, 0 # x9 = disk number
andi x6, x5, 1
bne x6, x0, disk_found # if LSB = 1 -> disk 1 moves
addi x9, x0, 1
andi x6, x5, 2
bne x6, x0, disk_found # if Second LSB = 1 -> disk 2 moves
addi x9, x0, 2 # else -> disk 3 moves
disk_found:
#Check impossible pattern (multiple bits)
andi x30, x5, 5
addi x31, x0, 5
beq x30, x31, pattern_match
jal x0, continue_move
pattern_match:
continue_move:
slli x5, x9, 2 # Each disk’s position occupies 4 bytes → multiply by 4.
addi x5, x5, 20
add x5, x2, x5
lw x18, 0(x5) # x18 = peg of current disk
bne x9, x0, handle_large # Branch if not small disk
addi x19, x18, 2 # Small disk moves by 2 positions
addi x6, x0, 3 # x6 = 3
blt x19, x6, display_move
sub x19, x19, x6 # round if > 3
jal x0, display_move
handle_large:
lw x6, 20(x2) # x6 = peg of small disk
addi x19, x0, 3
sub x19, x19, x18
sub x19, x19, x6 # x19 = peg of large disk
display_move:
la x20, obdata
add x5, x20, x18
# Load byte from obfuscated data
lbu x11, 0(x5)
# Decode constant (0x6F)
li x6, 0x6F
xor x11, x11, x6
# Final offset adjustment
addi x11, x11, -0x12
add x7, x20, x19
lbu x12, 0(x7)
xor x12, x12, x6
addi x12, x12, -0x12
la x10, str1
addi x17, x0, 4
ecall
addi x10, x9, 1
addi x17, x0, 1
ecall
la x10, str2
addi x17, x0, 4
ecall
addi x10, x11, 0
addi x17, x0, 11
ecall
la x10, str3
addi x17, x0, 4
ecall
addi x10, x12, 0
addi x17, x0, 11
ecall
addi x10, x0, 10
addi x17, x0, 11
ecall
# Calculate storage offset
slli x5, x9, 2
addi x5, x5, 20
add x5, x2, x5
# Update disk position
sw x19, 0(x5)
# Increment counter and loop
addi x8, x8, 1
jal x0, game_loop
finish_game:
lw x8, 0(x2)
lw x9, 4(x2)
lw x18, 8(x2)
lw x19, 12(x2)
lw x20, 16(x2)
addi x2, x2, 32
addi x17, x0, 10
ecall
.data
obdata: .byte 0x3c, 0x3b, 0x3a
str1: .asciz "Move Disk "
str2: .asciz " from "
str3: .asciz " to "
```
#### Adapted code
```
$ source $HOME/riscv-none-elf-gcc/setenv
$ make clean
$ make
$ make run
```
1.
```
riscv-none-elf-ld: hanoi_tower.o: can't link double-float modules with soft-float modules
```
This indicates that `hanoi_tower.o` was compiled with a hard‑float ABI, while the other files (`start.o` and `perfcounter.o`) use a soft‑float ABI, and the two cannot be linked together. Modify the `Makefile` to ensure that all `.s` / `.S` files are assembled and linked with the soft-float setting:
```Makefile
%.o: %.s
$(AS) $(AFLAGS) $< -o $@
```
2.
```
16:32:47 FATAL src/syscall.c:538: Unknown syscall: 4
16:32:47 FATAL src/syscall.c:538: Unknown syscall: 1
16:32:47 FATAL src/syscall.c:538: Unknown syscall: 4
16:32:47 FATAL src/syscall.c:538: Unknown syscall: 11
```
Change to a syscall number supported by rv32emu.
```c
#define SUPPORTED_SYSCALLS \
_(close, 57) \
_(lseek, 62) \
_(read, 63) \
_(write, 64) \
_(fstat, 80) \
_(exit, 93) \
_(gettimeofday, 169) \
_(brk, 214) \
_(clock_gettime, 403) \
_(open, 1024) \
IIF(RV32_HAS(SYSTEM))( \
_(sbi_base, 0x10) \
_(sbi_timer, 0x54494D45) \
_(sbi_rst, 0x53525354) \
) \
IIF(RV32_HAS(SDL))( \
_(draw_frame, 0xBEEF) \
_(setup_queue, 0xC0DE) \
_(submit_queue, 0xFEED) \
_(setup_audio, 0xBABE) \
_(control_audio, 0xD00D) \
)
```
**write**
* `a7 = 64`
* `a0 = 1`: stdout (standard output, i.e., screen).
* `a1`: the memory address of the data to be output.
* `a2`: the number of bytes to write.
* Print `a2` characters from the address pointed to by `a1`
Replace the code under `display_move`:
```riscv
la x10, str1
addi x17, x0, 4
ecall
addi x10, x9, 1
addi x17, x0, 1
ecall
la x10, str2
addi x17, x0, 4
ecall
addi x10, x11, 0
addi x17, x0, 11
ecall
la x10, str3
addi x17, x0, 4
ecall
addi x10, x12, 0
addi x17, x0, 11
ecall
addi x10, x0, 10
addi x17, x0, 11
ecall
```
with:
```riscv
# print "Move Disk "
li a7, 64
li a0, 1
la a1, str1
li a2, 10
ecall
# print disk num
addi t0, x9, 1 # 1,2,3
addi t0, t0, '0' # '1','2','3'
la a1, tmp_buf
sb t0, 0(a1)
li a7, 64
li a0, 1
li a2, 1
ecall
# print " from "
li a7, 64
li a0, 1
la a1, str2
li a2, 6
ecall
# print source peg
addi t0, x18, 'A'
la a1, tmp_buf
sb t0, 0(a1)
li a7, 64
li a0, 1
li a2, 1
ecall
# print " to "
li a7, 64
li a0, 1
la a1, str3
li a2, 4
ecall
# print target peg
addi t0, x19, 'A'
la a1, tmp_buf
sb t0, 0(a1)
li a7, 64
li a0, 1
li a2, 1
ecall
# newline
li a7, 64
li a0, 1
la a1, tmp_buf
li t0, 10
sb t0, 0(a1)
li a2, 1
ecall
```
### Problem C from [Quiz 3](https://hackmd.io/@sysprog/arch2025-quiz3)
#### C code
```c
static uint64_t mul32(uint32_t a, uint32_t b) {
uint64_t r = 0;
for (int i = 0; i < 32; i++) {
if (b & (1U << i))
r += (uint64_t)a << i;
}
return r;
}
static const uint16_t rsqrt_table[32] = {
65536, 46341, 32768, 23170, 16384, /* 2^0 to 2^4 */
11585, 8192, 5793, 4096, 2896, /* 2^5 to 2^9 */
2048, 1448, 1024, 724, 512, /* 2^10 to 2^14 */
362, 256, 181, 128, 90, /* 2^15 to 2^19 */
64, 45, 32, 23, 16, /* 2^20 to 2^24 */
11, 8, 6, 4, 3, /* 2^25 to 2^29 */
2, 1 /* 2^30, 2^31 */
};
uint32_t fast_rsqrt(uint32_t x) {
if (x == 0) return 0xFFFFFFFF;
if (x == 1) return 65536;
int exp = 31 - clz(x);
uint32_t = rsqrt_table[exp];
if (x > (1u << exp)) {
uint32_t y_next = (exp < 31) ? rsqrt_table[exp + 1] : 0;
uint32_t delta = y - y_next;
uint32_t frac = (uint32_t) ((((uint64_t)x - (1UL << exp)) << 16) >> exp);
y -= (uint32_t) ((delta * frac) >> 16);
}
for (int iter = 0; iter < x; iter++) {
uint32_t y2 = (uint32_t)mul32(y, y);
uint32_t xy2 = (uint32_t)(mul32(x, y2) >> 16);
y = (uint32_t)(mul32(y, (3u << 16) - xy2) >> 17);
}
static inline unsigned clz(uint32_t x)
{
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) {
n -= c;
x = y;
}
c >>= 1;
} while (c);
return n - x;
}
return y;
}
```
#### Bare-Metal Test Suite
```
$ source $HOME/riscv-none-elf-gcc/setenv
$ make clean
$ make
$ make run
```
RV32M (multiply and divide) is not permitted, so we have to modify the code in `fast_rsqrt()`.
```diff
- y -= (uint32_t) ((delta * frac) >> 16);
+ y -= (uint32_t) (mul32(delta, frac) >> 16);
```
Test Suite:
```c
static void test_fast_rsqrt(void)
{
uint32_t test_vals[] = {1, 2, 3, 4, 10, 16, 100, 1024, 65536};
for (int i = 0; i < sizeof(test_vals)/sizeof(test_vals[0]); i++) {
uint32_t x = test_vals[i];
uint32_t rs = fast_rsqrt(x);
TEST_LOGGER("x = ");
print_dec(x);
TEST_LOGGER("y = ");
print_dec(rs);
TEST_LOGGER("\n");
}
}
```
```
x = 1
y = 65536
x = 2
y = 46341
x = 3
y = 37837
x = 4
y = 32768
x = 10
y = 20724
x = 16
y = 16384
x = 100
y = 6553
x = 1024
y = 2048
x = 65536
y = 256
```