# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < [`winnie0710`](https://github.com/winnie0710/RISC-V) >
## Problem B : Encode uint32_t to uf8
### Abstract
#### goal
* Translate from C code into full RISC-V assembly programs
* Develop a simplified but informative use case to frame your assignment
* Include at least three test data cases
### clz
Use a binary search to locate the most significant set bit (the first 1 from the left).
This reduces the work from up to 32 single-bit checks to a fixed 5 iterations.
```c
/* Count the leading zero bits of a 32-bit word.
 *
 * The search window is halved on every step (16, 8, 4, 2, 1), so exactly
 * five shift-and-test steps run for any input. Returns 32 when x is 0.
 */
static inline unsigned clz(uint32_t x)
{
    int remaining = 32;
    for (int width = 16; width != 0; width >>= 1) {
        uint32_t upper = x >> width;
        if (upper != 0) {
            /* The top set bit lies in the upper part: discard the lower part. */
            remaining -= width;
            x = upper;
        }
    }
    /* x is now 0 or 1; a remaining 1 accounts for the final bit position. */
    return remaining - x;
}
```
The corresponding RISC-V assembly is shown below.
```s
# Driver for the clz routine: loads one test word, calls clz,
# prints the result through print, then exits.
.data
test_x: .word 0x00001234
str: .asciz "CLZ Result: " # .asciz appends the terminating "\0"
.text
main:
la t1, test_x
lw a0, 0(t1) # a0 = test_x, the argument passed to clz
jal ra, clz
mv a1, a0 # Put the result to a1 first to avoid being overwritten
jal ra, print
# End the program
li a7, 10 # Load System Service No. 10 (Exit)
ecall # Execute the system call to exit the program
# clz: count the leading zero bits of a 32-bit word.
# C equivalent: unsigned clz(uint32_t x)
# In:   a0 = x
# Out:  a0 = number of leading zeros (32 when x == 0)
# Uses: t0 (y), t1 (n), t2 (c) -- temporaries only, so no callee-saved
#       register is modified and no stack frame is needed.
clz:
li t1, 32 # n = 32
li t2, 16 # c = 16
clz_loop:
srl t0, a0, t2 # y = x >> c
beqz t0, clz_next # y == 0: the top set bit is in the lower part
sub t1, t1, t2 # n -= c
mv a0, t0 # x = y
clz_next:
srli t2, t2, 1 # c >>= 1
bnez t2, clz_loop # five steps in total: c = 16, 8, 4, 2, 1
sub a0, t1, a0 # return n - x (x is 0 or 1 here)
ret # return to the caller
# print: writes the string at str, then the integer passed in a1.
# In:   a1 = integer to print
# Uses: a0 and a7 are overwritten for the two system calls.
print:
la a0, str # load string
li a7, 4 # print string
ecall
mv a0, a1 # a0 = integer the caller stored in a1
li a7, 1 # print integer
ecall
ret # go back to main
```
```
# Fragment: print the string at msg_test, then the integer held in s2.
# msg_test is not defined in this snippet; it must exist in the program's .data section.
la a0, msg_test # "test data: "
li a7, 4 # a7=4, print string
ecall
mv a0, s2 # print i
li a7, 1 # a7=1, print integer
ecall
### uf8_decode
offset
```c
/* Expand a uf8 byte into the 32-bit value it stands for.
 *
 * The high nibble is the exponent e and the low nibble the mantissa m;
 * the result is (m << e) plus the starting offset of exponent bucket e.
 */
uint32_t uf8_decode(uf8 fl)
{
    const uint8_t exp = fl >> 4;      /* high nibble */
    const uint32_t frac = fl & 0x0f;  /* low nibble */
    const uint32_t base = (0x7FFF >> (15 - exp)) << 4;
    return base + (frac << exp);
}
```
The corresponding RISC-V assembly is shown below.
```s
# Driver: decode one uf8 byte and exit. Nothing is printed; a0 is not
# modified between the return from decode and the exit call.
.data
.text
main:
li a0, 0x2B # fl = 0x2B (exponent nibble 2, mantissa nibble 0xB)
jal ra, decode # a0 = decoded value
li a7, 10 # a7=10, exit
ecall
# decode: expand a uf8 code into its 32-bit value.
# In:   a0 = fl (mantissa in bits 3..0, exponent in the bits above them;
#       assumes fl is a byte value so the exponent is at most 15)
# Out:  a0 = (m << e) + (((1 << e) - 1) << 4)
# Uses: t0 (m), t1 (e), t2 (offset)
decode:
andi t0, a0, 0x0F # m = fl & 0x0f
srli t1, a0, 4 # e = fl >> 4
li t2, 1
sll t2, t2, t1 # 1 << e
addi t2, t2, -1 # (1 << e) - 1
slli t2, t2, 4 # offset = ((1<<e)-1) << 4
sll t0, t0, t1 # m << e
add a0, t0, t2 # value = (m << e) + offset
ret
```
### uf8_encode
$$\text{value} = \text{overflow} + \text{mantissa} \times 2^{\text{exponent}}$$
```c
/* Encode a 32-bit value as a uf8 byte: (exponent << 4) | mantissa.
 *
 * clz() supplies a first guess for the exponent from the position of the
 * top set bit; the guess is then corrected downwards and upwards until
 * overflow(exponent) <= value < overflow(exponent + 1), or exponent is 15.
 */
uf8 uf8_encode(uint32_t value)
{
    /* Values below 16 are their own code (exponent 0). */
    if (value < 16)
        return value;

    const int lz = clz(value);
    const int top_bit = 31 - lz;

    uint8_t exp = 0;
    uint32_t base = 0; /* smallest value representable with exponent exp */

    if (top_bit >= 5) {
        /* Empirical first guess, clamped to the largest exponent. */
        exp = top_bit - 4;
        if (exp > 15)
            exp = 15;

        /* Build the offset that belongs to the guessed exponent. */
        uint8_t step = 0;
        while (step < exp) {
            base = (base << 1) + 16;
            step++;
        }

        /* Walk back down while the guess is too high. */
        for (; exp > 0 && value < base; exp--)
            base = (base - 16) >> 1;
    }

    /* Walk up until the next bucket would start above the value. */
    for (; exp < 15; exp++) {
        const uint32_t next = (base << 1) + 16;
        if (value < next)
            break;
        base = next;
    }

    const uint8_t frac = (value - base) >> exp;
    return (exp << 4) | frac;
}
```
The corresponding RISC-V assembly is shown below.
```s
# Driver: pass one sample value to start_encode and exit.
# start_encode is expected to return the uf8 code in a0; nothing is printed.
.data
.text
main:
li a0, 0x00235656 # value to encode
jal ra, start_encode
li a7, 10 # a7=10, exit
ecall
# start_encode: encode a 32-bit value as a uf8 byte.
# C equivalent: uf8 uf8_encode(uint32_t value)
# In:   a0 = value (treated as unsigned)
# Out:  a0 = (exponent << 4) | mantissa, truncated to 8 bits
# Saved registers (stored on the stack and restored before returning):
#       ra, s0 = value, s1 = exponent, s2 = overflow
# Scratch: t0, t1, t2 (none of them is live across the call to clz)
start_encode:
li t0, 16
bltu a0, t0, encode_return # value < 16: the value is its own code
addi sp, sp, -16
sw ra, 0(sp) # ra is overwritten by the call to clz below
sw s0, 4(sp)
sw s1, 8(sp)
sw s2, 12(sp)
mv s0, a0 # s0 = value
li s1, 0 # exponent = 0
li s2, 0 # overflow = 0
jal ra, clz # a0 = lz
li t0, 31
sub t1, t0, a0 # msb = 31 - lz
li t0, 5
blt t1, t0, exact_exponent # msb < 5: keep exponent = 0, overflow = 0
addi s1, t1, -4 # exponent = msb - 4
li t0, 16
li t1, 0 # e = 0 (loop counter for calculate_overflow)
blt s1, t0, calculate_overflow # exponent < 16: no clamp needed
li s1, 15 # exponent = 15
# exponent >= 1 here, so the do-while shape runs exactly `exponent` times
calculate_overflow:
slli s2, s2, 1
addi s2, s2, 16 # overflow = (overflow << 1) + 16
addi t1, t1, 1 # e++
blt t1, s1, calculate_overflow # e < exponent
adjust_overflow:
beqz s1, exact_exponent # exponent == 0: nothing left to lower
bgeu s0, s2, exact_exponent # value >= overflow: estimate is not too high
addi s2, s2, -16
srli s2, s2, 1 # overflow = (overflow - 16) >> 1
addi s1, s1, -1 # exponent--
j adjust_overflow
exact_exponent:
li t2, 15
bge s1, t2, finish # exponent >= 15, out of while
slli t1, s2, 1
addi t1, t1, 16 # next_overflow = (overflow << 1) + 16
bltu s0, t1, finish # value < next_overflow: exponent is exact
mv s2, t1 # overflow = next_overflow
addi s1, s1, 1 # exponent++
j exact_exponent
finish:
sub t0, s0, s2 # value - overflow
srl t0, t0, s1 # mantissa = (value - overflow) >> exponent
slli t1, s1, 4 # exponent << 4
or a0, t1, t0
andi a0, a0, 0xFF # the C version returns a uint8_t: keep the low byte only
lw ra, 0(sp)
lw s0, 4(sp)
lw s1, 8(sp)
lw s2, 12(sp)
addi sp, sp, 16
encode_return:
ret # return to main

# clz: count the leading zero bits of a 32-bit word.
# In:   a0 = x
# Out:  a0 = number of leading zeros (32 when x == 0)
# Leaf routine: modifies only t0 (y), t1 (n), t2 (c); no stack use.
clz:
li t1, 32 # n = 32
li t2, 16 # c = 16
clz_loop:
srl t0, a0, t2 # y = x >> c
beqz t0, clz_next # y == 0: keep x and n
sub t1, t1, t2 # n -= c
mv a0, t0 # x = y
clz_next:
srli t2, t2, 1 # c >>= 1
bnez t2, clz_loop
sub a0, t1, a0 # return n - x
ret # return to encode
```
### Test
:::spoiler Full RISC-V assembly
```code
# Test driver: runs test and prints msg_pass or msg_fail depending on
# the flag returned in a0, then exits.
.data
msg_pass: .asciz "All tests passed."
msg_fail: .asciz "Failed."
msg_mismatch_a: .asciz " mismatch: fl= "
msg_mismatch_b: .asciz " value= "
msg_mismatch_c: .asciz " fl2= "
msg_nl: .asciz "\n"
msg_exponent: .asciz "Exponent: "
msg_1round: .asciz "passed = "
.text
main:
jal ra, test # start test
beqz a0, print_fail # a0=0, fail
la a0, msg_pass
li a7, 4 # a7=4, print string
ecall
li a7, 10 # a7=10, exit
ecall
# Reached only through the branch above; the success path exits before it.
print_fail:
la a0, msg_fail
li a7, 4 # a7=4, print string
ecall
li a7, 10 # a7=10, exit
ecall
# decode: expand a uf8 code into its 32-bit value.
# In:   a0 = fl (mantissa in bits 3..0, exponent in the bits above them;
#       the test loop only passes values 0..255)
# Out:  a0 = (m << e) + (((1 << e) - 1) << 4)
# Uses: t0 (m), t1 (e), t2 (offset)
decode:
andi t0, a0, 0x0F # m = fl & 0x0f
srli t1, a0, 4 # e = fl >> 4
li t2, 1
sll t2, t2, t1 # 1 << e
addi t2, t2, -1 # (1 << e) - 1
slli t2, t2, 4 # offset = ((1<<e)-1) << 4
sll t0, t0, t1 # m << e
add a0, t0, t2 # value = (m << e) + offset
ret
# encode: encode a 32-bit value as a uf8 byte and print the chosen exponent.
# C equivalent: uf8 uf8_encode(uint32_t value)
# In:   a0 = value (treated as unsigned)
# Out:  a0 = (exponent << 4) | mantissa, truncated to 8 bits
# Side effect: prints msg_exponent followed by the exponent on every call.
# Saved registers (stored on the stack and restored before returning):
#       ra, s0 = value, s1 = exponent, s2 = overflow
# Scratch: t0, t1, t2, a7 (no temporary is live across the call to clz)
encode:
addi sp, sp, -16
sw ra, 0(sp)
sw s0, 4(sp)
sw s1, 8(sp)
sw s2, 12(sp)
mv s0, a0 # s0 = value
li s1, 0 # exponent = 0
li s2, 0 # overflow = 0
jal ra, clz # a0 = lz
li t0, 31
sub t1, t0, a0 # msb = 31 - lz
li t0, 5
blt t1, t0, exact_exponent # msb < 5, go to exact_exponent
addi s1, t1, -4 # exponent = msb - 4
li t0, 16
li t1, 0 # e = 0 (loop counter for calculate_overflow)
blt s1, t0, calculate_overflow # exponent < 16: no clamp needed
li s1, 15 # exponent = 15
# exponent >= 1 here, so the do-while shape runs exactly `exponent` times
calculate_overflow:
slli s2, s2, 1
addi s2, s2, 16 # overflow = (overflow << 1) + 16
addi t1, t1, 1 # e++
blt t1, s1, calculate_overflow # e < exponent
adjust_overflow:
beqz s1, exact_exponent # exponent == 0: nothing left to lower
bgeu s0, s2, exact_exponent # value >= overflow: estimate is not too high
addi s2, s2, -16
srli s2, s2, 1 # overflow = (overflow - 16) >> 1
addi s1, s1, -1 # exponent--
j adjust_overflow
exact_exponent:
li t2, 15
bge s1, t2, encode_finish # exponent >= 15, out of while
slli t1, s2, 1
addi t1, t1, 16 # next_overflow = (overflow << 1) + 16
bltu s0, t1, encode_finish # value < next_overflow: exponent is exact
mv s2, t1 # overflow = next_overflow
addi s1, s1, 1 # exponent++
j exact_exponent
encode_finish:
la a0, msg_exponent
li a7, 4 # print string
ecall
mv a0, s1 # exponent
li a7, 1 # print integer
ecall
sub t0, s0, s2 # value - overflow
srl t0, t0, s1 # mantissa = (value - overflow) >> exponent
slli t1, s1, 4 # exponent << 4
or a0, t1, t0
andi a0, a0, 0xFF # the C version returns a uint8_t: keep the low byte only
lw ra, 0(sp)
lw s0, 4(sp)
lw s1, 8(sp)
lw s2, 12(sp)
addi sp, sp, 16
ret # return to the caller

# clz: count the leading zero bits of a 32-bit word.
# In:   a0 = x
# Out:  a0 = number of leading zeros (32 when x == 0)
# Leaf routine: modifies only t0 (y), t1 (n), t2 (c); no stack use.
clz:
li t1, 32 # n = 32
li t2, 16 # c = 16
clz_loop:
srl t0, a0, t2 # y = x >> c
beqz t0, clz_next # y == 0: keep x and n
sub t1, t1, t2 # n -= c
mv a0, t0 # x = y
clz_next:
srli t2, t2, 1 # c >>= 1
bnez t2, clz_loop
sub a0, t1, a0 # return n - x
ret # return to encode
# test: round-trip check over every uf8 code 0..255.
# For each i: value = decode(i), fl2 = encode(value). The run fails when
# fl2 != i or when value is not greater than the previous value.
# One line with fl, value and fl2 is printed for every i, pass or fail.
# Out:  a0 = 1 if every code passed, 0 otherwise
# Saved registers: ra, s0-s3 (stored on the stack and restored before returning)
test:
addi sp, sp, -20
sw ra, 0(sp)
sw s0, 4(sp) # previous_value
sw s1, 8(sp) # passed
sw s2, 12(sp) # i
sw s3, 16(sp) # value
addi s0, x0, -1 # previous_value = -1
addi s1, x0, 1 # passed = 1
addi s2, x0, 0 # i = 0
test_loop:
addi t1, x0, 256
bge s2, t1, test_done # stop after i = 255
mv a0, s2
jal ra, decode
mv s3, a0 # s3 = value
jal ra, encode # a0 still holds value here
mv t1, a0 # t1 = fl2
# NOTE(review): t1 is read again after the ecalls below; this relies on
# ecall leaving t1 untouched -- confirm for the target simulator.
la a0, msg_mismatch_a
li a7, 4
ecall
mv a0, s2 # fl
li a7, 1
ecall
la a0, msg_mismatch_b
li a7, 4
ecall
mv a0, s3 # value
li a7, 1
ecall
la a0, msg_mismatch_c
li a7, 4
ecall
mv a0, t1 # fl2
li a7, 1
ecall
la a0, msg_nl
li a7, 4
ecall
bne t1, s2, set_false # fail
slt t0, s0, s3 # ok when previous_value < value (signed compare)
bnez t0, test_plus
set_false:
addi s1, x0, 0 # passed = false
test_plus:
mv s0, s3 # previous_value = value
addi s2, s2, 1 # i++
j test_loop
test_done:
mv a0, s1 # return passed
lw ra, 0(sp)
lw s0, 4(sp) # previous_value
lw s1, 8(sp) # passed
lw s2, 12(sp) # i
lw s3, 16(sp) # value
addi sp, sp, 20
ret
```
:::
### Analysis
kkk
## Problem C : bfloat16 format
### Abstract
#### goal
Manually simulate floating-point arithmetic using bitwise operations directly on a 16-bit unsigned register.
```c
/* A bfloat16 value, stored as its raw 16-bit pattern. */
typedef struct {
uint16_t bits;
} bf16_t;
```
### bf16_add
```c
/* Add two bf16 values using only integer operations on their bit patterns.
 *
 * Field layout used below: bit 15 = sign, bits 14..7 = exponent,
 * bits 6..0 = mantissa. BF16_NAN() and BF16_ZERO() are defined elsewhere.
 */
static inline bf16_t bf16_add(bf16_t a, bf16_t b)
{
/* Split both operands into sign, exponent and mantissa. */
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
/* Exponent 0xFF in a: return a unchanged if its mantissa is non-zero;
 * otherwise the result depends on b (see the nested checks). */
if (exp_a == 0xFF) {
if (mant_a)
return a;
if (exp_b == 0xFF)
return (mant_b || sign_a == sign_b) ? b : BF16_NAN();
return a;
}
if (exp_b == 0xFF)
return b;
/* Adding a zero (exponent and mantissa both 0) returns the other operand. */
if (!exp_a && !mant_a)
return b;
if (!exp_b && !mant_b)
return a;
/* A non-zero exponent gets bit 7 set in the mantissa (the implicit leading 1). */
if (exp_a)
mant_a |= 0x80;
if (exp_b)
mant_b |= 0x80;
int16_t exp_diff = exp_a - exp_b;
uint16_t result_sign;
int16_t result_exp;
uint32_t result_mant;
/* Align the operand with the smaller exponent; if the gap exceeds 8 bits
 * the larger operand is returned as is. */
if (exp_diff > 0) {
result_exp = exp_a;
if (exp_diff > 8)
return a;
mant_b >>= exp_diff;
} else if (exp_diff < 0) {
result_exp = exp_b;
if (exp_diff < -8)
return b;
mant_a >>= -exp_diff;
} else {
result_exp = exp_a;
}
if (sign_a == sign_b) {
/* Same sign: add magnitudes; a carry into bit 8 shifts right and bumps the exponent. */
result_sign = sign_a;
result_mant = (uint32_t) mant_a + mant_b;
if (result_mant & 0x100) {
result_mant >>= 1;
/* Exponent reached 0xFF: return sign | 0x7F80 (exponent all ones, mantissa 0). */
if (++result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
} else {
/* Different signs: subtract the smaller magnitude from the larger one. */
if (mant_a >= mant_b) {
result_sign = sign_a;
result_mant = mant_a - mant_b;
} else {
result_sign = sign_b;
result_mant = mant_b - mant_a;
}
if (!result_mant)
return BF16_ZERO();
/* Normalize: shift left until bit 7 is set; return zero if the exponent runs out. */
while (!(result_mant & 0x80)) {
result_mant <<= 1;
if (--result_exp <= 0)
return BF16_ZERO();
}
}
/* Reassemble the fields; bit 7 of the mantissa (the implicit 1) is dropped. */
return (bf16_t) {
.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(result_mant & 0x7F),
};
}
```
The corresponding RISC-V assembly is shown below.
```s
# Field extraction for bf16_add (work in progress: only the unpacking step).
# In:   a0 = bits of operand a, a1 = bits of operand b
# Out:  s0/s1/s2 = sign/exponent/mantissa of a
#       s3/s4/s5 = sign/exponent/mantissa of b
main:
# a0 -> bf16_t a
srli s0, a0, 15 # sign a, bit 15
andi s0, s0, 1
srli s1, a0, 7 # exp a, bits 7-14
andi s1, s1, 0xFF
andi s2, a0, 0x7F # mantissa a, bits 0-6 (taken from a0, not from operand b)
# a1 -> bf16_t b
srli s3, a1, 15 # sign b, bit 15
andi s3, s3, 1
srli s4, a1, 7 # exp b, bits 7-14
andi s4, s4, 0xFF
andi s5, a1, 0x7F # mantissa b, bits 0-6
```
### bf16_mul
```c
```
The corresponding RISC-V assembly is shown below.
```s
```
## Leetcode137. Single Number II
Description: Given an integer array nums where every element appears three times except for one, which appears exactly once. Find the single element and return it.
You must implement a solution with a linear runtime complexity and use only constant extra space.
Constraints:
* 1 <= nums.length <= 3 * 10^4
* -2^31 <= nums[i] <= 2^31 - 1
* Each element in nums appears exactly three times except for one element which appears once.
### Solution
#### C
First Version
```c
/* Count leading zero bits by scanning from bit 31 downwards.
 * Returns 32 when x is 0.
 */
static inline int my_clz(uint32_t x) {
    int zeros = 0;
    uint32_t probe = 1U << 31;
    /* Slide a one-bit mask to the right until it hits a set bit or falls off. */
    while (probe != 0 && !(x & probe)) {
        zeros++;
        probe >>= 1;
    }
    return zeros;
}
/* Return the element that appears once when every other element appears
 * three times.
 *
 * A bit of the answer is set exactly when the number of elements having
 * that bit set is not a multiple of 3. Only the bit positions below the
 * highest set bit found in the array are examined.
 */
int singleNumber(int* nums, int numsSize) {
    /* Smallest leading-zero count over all non-zero elements. */
    int fewest_zeros = 32;
    for (int k = 0; k < numsSize; k++) {
        if (nums[k] == 0)
            continue;
        int lz = my_clz((uint32_t)nums[k]);
        fewest_zeros = lz < fewest_zeros ? lz : fewest_zeros;
    }

    const int width = 32 - fewest_zeros; /* number of bit positions in use */
    int answer = 0;
    for (int bit = 0; bit < width; bit++) {
        const uint32_t mask = 1U << bit;
        int hits = 0;
        for (int k = 0; k < numsSize; k++)
            hits += (nums[k] & mask) != 0;
        if (hits % 3 != 0)
            answer |= mask;
    }
    return answer;
}
```
#### Assembly code
```s
# Driver for singleNumber: runs one test case, prints "Result: " followed
# by the returned value and a newline, then exits with code 0.
.data
# Test Case 1: nums = [2, 2, 3, 2]
nums1:
.word 2, 2, 3, 2
nums1_size:
.word 4
result_str:
.asciz "Result: "
newline_str:
.asciz "\n"
# Scratch buffer that print_int fills with the decimal text of a number
int_buffer:
.zero 12 # Allocate and zero-initialize 12 bytes
.text
.globl main
main:
# Test Case 1
la a0, nums1 # Load address of nums1 array
lw a1, nums1_size # Load size of nums1 array
jal ra, singleNumber # Call singleNumber(nums1, size)
mv a2, a0 # Move result to a2 for printing
la a0, result_str # Load address of result string
jal ra, print_result # Print result
# Exit program
li a0, 0 # Exit code 0
li a7, 93 # ECALL for exit
ecall
# int singleNumber(int* nums, int numsSize)
# In:   a0 = nums (address of the first word), a1 = numsSize
# Out:  a0 = result; a bit is set when the count of elements with that
#       bit set is not a multiple of 3
# Register roles: s0 = nums, s1 = numsSize, s2 = i, s3 = count (kept modulo 3),
#       s4 = j, s5 = min_leading_zeros, s6 = 32 - min_leading_zeros, s7 = result
# ra and s0-s7 are stored on the stack and restored before returning.
singleNumber:
addi sp, sp, -36 # Allocate stack space
sw ra, 32(sp) # Save return address
sw s0, 28(sp) # Save s0
sw s1, 24(sp) # Save s1
sw s2, 20(sp) # Save s2
sw s3, 16(sp) # Save s3
sw s4, 12(sp) # Save s4
sw s5, 8(sp) # Save s5
sw s6, 4(sp) # Save s6
sw s7, 0(sp) # Save s7
mv s0, a0 # s0 = nums (int* nums)
mv s1, a1 # s1 = numsSize (int numsSize)
li s7, 0 # s7 = result = 0
# Initialize min_leading_zeros = 32
li s5, 32 # s5 = min_leading_zeros = 32
li s2, 0 # s2 = i = 0
# First pass: find the smallest leading-zero count among non-zero elements
find_min_leading_zeros_loop:
bge s2, s1, end_min_leading_zeros_loop
# Load nums[i]
slli t0, s2, 2 # t0 = i * 4
add t0, s0, t0 # t0 = &nums[i]
lw t1, 0(t0) # t1 = nums[i]
bnez t1, process_non_zero
# nums[i] == 0: skip the my_clz call
addi s2, s2, 1 # i++
jal zero, find_min_leading_zeros_loop
process_non_zero:
# Call my_clz((uint32_t)nums[i])
mv a0, t1 # a0 = nums[i]
jal ra, my_clz # leading_zeros in a0
mv t2, a0 # t2 = leading_zeros
blt t2, s5, update_min_leading_zeros
jal zero, skip_update_min_leading_zeros
update_min_leading_zeros:
mv s5, t2 # min_leading_zeros = leading_zeros
skip_update_min_leading_zeros:
addi s2, s2, 1 # i++
jal zero, find_min_leading_zeros_loop
end_min_leading_zeros_loop:
# Compute max_bit_index = 32 - min_leading_zeros
li t0, 32
sub s6, t0, s5 # s6 = 32 - min_leading_zeros
# Initialize s2 = 0 (i)
li s2, 0 # s2 = i = 0
# Second pass: for each bit position i, count the elements with bit i set
singleNumber_outer_loop:
bge s2, s6, singleNumber_end_outer_loop # if i >= (32 - min_leading_zeros), exit loop
# Initialize count = 0
li s3, 0 # s3 = count = 0
li s4, 0 # s4 = j = 0
singleNumber_inner_loop:
bge s4, s1, singleNumber_end_inner_loop # if j >= numsSize, exit inner loop
# Load nums[j]
slli t1, s4, 2 # t1 = j * 4
add t1, s0, t1 # t1 = &nums[j]
lw t2, 0(t1) # t2 = nums[j]
# Check if nums[j] & (1U << i)
li t3, 1
sll t3, t3, s2 # t3 = 1U << i
and t4, t2, t3 # t4 = nums[j] & (1U << i)
beqz t4, singleNumber_skip_increment # if zero, skip increment
# count = (count + 1) % 3
addi s3, s3, 1 # count++
li t5, 3
blt s3, t5, singleNumber_skip_reset # if count < 3, skip reset
li s3, 0 # count = 0
singleNumber_skip_reset:
# Continue to next iteration
singleNumber_skip_increment:
addi s4, s4, 1 # j++
jal zero, singleNumber_inner_loop
singleNumber_end_inner_loop:
# if count != 0
beqz s3, singleNumber_skip_bit_set # if count == 0, skip setting bit
# result |= (1U << i)
li t6, 1
sll t6, t6, s2 # t6 = 1U << i
or s7, s7, t6 # s7 |= t6
singleNumber_skip_bit_set:
addi s2, s2, 1 # i++
jal zero, singleNumber_outer_loop
singleNumber_end_outer_loop:
mv a0, s7 # Move result to a0
# Restore registers and stack
lw s7, 0(sp)
lw s6, 4(sp)
lw s5, 8(sp)
lw s4, 12(sp)
lw s3, 16(sp)
lw s2, 20(sp)
lw s1, 24(sp)
lw s0, 28(sp)
lw ra, 32(sp)
addi sp, sp, 36
ret
# int my_clz(uint32_t x)
# In:   a0 = x
# Out:  a0 = number of leading zero bits (32 when x == 0)
# Uses: t0 (count), t1 (i), t2 (mask), t3 (masked bit); ra and s0 are
#       stored on the stack and restored before returning.
my_clz:
addi sp, sp, -8 # Allocate stack space
sw ra, 4(sp) # Save return address
sw s0, 0(sp) # Save s0
mv s0, a0 # s0 = x
# if x == 0, return 32
beqz s0, my_clz_return_32
# if x < 0 (i.e., (int32_t)x < 0), return 0
# (bit 31 is set, so there are no leading zeros)
bltz s0, my_clz_return_0
# count = 0
li t0, 0 # t0 = count
li t1, 31 # t1 = i
# Scan from bit 31 downwards until the first set bit
my_clz_loop:
blt t1, zero, my_clz_end_loop # if i < 0, end loop
li t2, 1
sll t2, t2, t1 # t2 = 1U << i
and t3, s0, t2 # t3 = x & (1U << i)
bnez t3, my_clz_end_loop
addi t0, t0, 1 # count++
addi t1, t1, -1 # i--
jal zero, my_clz_loop
my_clz_end_loop:
mv a0, t0 # Return count
jal zero, my_clz_exit
my_clz_return_32:
li a0, 32
jal zero, my_clz_exit
my_clz_return_0:
li a0, 0
my_clz_exit:
# Restore registers and return
lw s0, 0(sp)
lw ra, 4(sp)
addi sp, sp, 8
ret
# Simplified print_result function for Ripes simulator
# Prints the string at a0, then the integer in a2 (through print_int),
# then newline_str.
# In:   a0 = address of the label string, a2 = integer to print
# a0 and ra are stored on the stack and restored before returning;
# a7 is overwritten.
print_result:
addi sp, sp, -8 # Allocate stack space
sw ra, 4(sp) # Save return address
sw a0, 0(sp) # Save a0 (address of result_str)
# Print "Result: "
mv a0, a0 # a0 already contains address of result_str
li a7, 4 # ECALL for print string
ecall
# Print integer in a2
mv a0, a2 # Move result to a0
jal ra, print_int # Call print_int
# Print newline
la a0, newline_str
li a7, 4
ecall
# Restore registers and stack
lw a0, 0(sp) # Restore a0
lw ra, 4(sp) # Restore ra
addi sp, sp, 8
ret
# print_int: print the signed 32-bit integer in a0 as decimal text.
# The digits are written backwards into int_buffer (12 bytes) and the
# finished string is printed with the print-string ecall.
# In:   a0 = number
# Out:  none; a0, a7 and t6 are overwritten
# Saved and restored: ra, t0-t5, a4, a5
# Buffer layout: the longest text, "-2147483648", is 11 characters and fills
# int_buffer[0..10]; int_buffer[11] is never written after the clear and
# therefore always holds the terminating NUL.
# Division by 10 is done by repeated subtraction (no mul/div instruction is used).
print_int:
addi sp, sp, -36 # Adjust stack size for additional registers
sw ra, 32(sp)
sw t0, 28(sp)
sw t1, 24(sp)
sw t2, 20(sp)
sw t3, 16(sp)
sw t4, 12(sp)
sw t5, 8(sp)
sw a4, 4(sp) # Save a4
sw a5, 0(sp) # Save a5
mv t0, a0 # t0 = number
la t1, int_buffer # t1 = address of int_buffer
# Clear the buffer
li t2, 12 # t2 = buffer size
li t3, 0 # t3 = zero
mv a5, t1 # a5 = buffer pointer
clear_buffer_loop:
beqz t2, clear_buffer_done
sb t3, 0(a5)
addi a5, a5, 1
addi t2, t2, -1
jal zero, clear_buffer_loop
clear_buffer_done:
# Point t1 at the slot for the last digit
la t1, int_buffer # t1 = address of int_buffer
addi t1, t1, 10 # last digit goes to int_buffer[10]; int_buffer[11] stays NUL
li t2, 48 # t2 = ASCII code for '0'
li t3, 0 # t3 = sign flag
li a4, 10 # a4 = 10
# Check if number is zero
beq t0, zero, print_int_zero
# Check if number is negative
blt t0, zero, print_int_negative
print_int_loop:
# t4 = t0 / 10 (quotient)
# t5 = t0 % 10 (remainder)
# t0 holds a magnitude and is treated as unsigned from here on, so that
# 0x80000000 (the magnitude of INT_MIN) is handled correctly.
mv t5, t0 # t5 = t0 (remainder)
li t4, 0 # t4 = 0 (quotient)
print_int_divide_loop:
bltu t5, a4, print_int_divide_end # unsigned compare: remainder < 10
sub t5, t5, a4 # t5 -= 10
addi t4, t4, 1 # t4 += 1
jal zero, print_int_divide_loop
print_int_divide_end:
# Now t4 is quotient, t5 is remainder
# Convert remainder to character
add t6, t5, t2 # t6 = t5 + '0'
sb t6, 0(t1) # Store character at t1
addi t1, t1, -1 # t1 -= 1
# Update t0 to quotient
mv t0, t4 # t0 = t4
# If t0 != 0, continue loop
bnez t0, print_int_loop
# After loop, check for sign
beqz t3, print_int_print # If not negative, proceed to print
# Add '-' sign
li t6, 45 # '-'
sb t6, 0(t1)
addi t1, t1, -1
print_int_print:
addi t1, t1, 1 # Adjust t1 to point to the start of string
# Print the number string
mv a0, t1 # a0 = pointer to start of string
li a7, 4
ecall
# Restore registers and stack
lw a5, 0(sp) # Restore a5
lw a4, 4(sp) # Restore a4
lw t5, 8(sp)
lw t4, 12(sp)
lw t3, 16(sp)
lw t2, 20(sp)
lw t1, 24(sp)
lw t0, 28(sp)
lw ra, 32(sp)
addi sp, sp, 36
ret
print_int_negative:
li t3, 1 # Set sign flag
neg t0, t0 # t0 = magnitude; for INT_MIN this is 0x80000000 (unsigned 2147483648)
jal zero, print_int_loop
print_int_zero:
# Handle zero
li t6, 48 # '0'
sb t6, 0(t1)
addi t1, t1, -1
jal zero, print_int_print
```
### Test
GitHub
### Analysis
cycle

## 5-stage pipelined processor