owned this note
owned this note
Published
Linked with GitHub
# Assignment1: RISC-V Assembly and Instruction Pipeline
## Problem C in Quiz1
### C code
#### Orinigal version
Consider the C implementation of `__builtin_clz`:
```clike=
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
```
This function can count leading zeros but it using loop.
In worst case `x=32`, this loop will be executed 32 times.
#### No For loop version
In his book "Hackers’ Delight," Henry Warren suggests a technique to calculate the number of leading zeros for 32-bit unsigned integers.
This function used binary search to implement.
```clike=
uint32_t no_for_loop_clz(uint32_t x){
uint32_t count = 0;
if(x == 0){
return 32;
}
if(x <= 0x0000FFFF){
count += 16;
x <<= 16;
}
if (x <= 0x00FFFFFF){
count += 8;
x <<= 8;
}
if (x <= 0x0FFFFFFF){
count += 4;
x <<= 4;
}
if (x <= 0x3FFFFFFF){
count += 2;
x <<= 2;
}
if (x <= 0x7FFFFFFF){
count ++;
}
return count;
}
```
### Assembly code
#### Orinigal version
```asm=
my_clz:
#########################################################################
# argument:
# x <- a0
# variable:
# count <- t0
# i <- t1
# return value:
# count <- a0
#########################################################################
li t0, 0x0 # int count = 0
li t1, 0x1F # int i = 31
li t2, 0x1 # 1U
CLZ_LOOP:
sll t3, t2, t1 # t3 <- (1U << i)
and t3, a0, t3 # t3 <- x & (1U << i)
beq t3, x0, CLZ_IF_END
CLZ_IF:
mv a0, t0
ret
CLZ_IF_END:
addi t0, t0, 0x1 # count++
addi t1, t1, -0x1 # --i
bge t1, x0, CLZ_LOOP
mv a0, t0
ret
```
#### No For loop version
```asm=
no_for_loop_clz:
#########################################################################
# argument:
# x <- a0
# variable:
# count <- t0
# return value:
# count <- a0
#########################################################################
li t0, 0 # count = 0;
beq a0, x0, IF_0 # if(x == 0)
j IF_0_END
IF_0:
li a0, 32
ret
IF_0_END:
li t1, 0x0000FFFF
bltu a0, t1, IF_0x0000FFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x0000FFFF
j IF_0x0000FFFF_END
IF_0x0000FFFF:
addi t0, t0, 16
slli a0, a0, 16
IF_0x0000FFFF_END:
li t1, 0x00FFFFFF
bltu a0, t1, IF_0x00FFFFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x00FFFFFF
j IF_0x00FFFFFF_END
IF_0x00FFFFFF:
addi t0, t0, 8
slli a0, a0, 8
IF_0x00FFFFFF_END:
li t1, 0x0FFFFFFF
bltu a0, t1, IF_0FFFFFFF # if (x <= 0x0FFFFFFF)
beq a0, t1, IF_0FFFFFFF
j IF_0FFFFFFF_END
IF_0FFFFFFF:
addi t0, t0, 4
slli a0, a0, 4
IF_0FFFFFFF_END:
li t1, 0x3FFFFFFF
bltu a0, t1, IF_3FFFFFFF # if (x <= 0x3FFFFFFF)
beq a0, t1, IF_3FFFFFFF
j IF_3FFFFFFF_END
IF_3FFFFFFF:
addi t0, t0, 2
slli a0, a0, 2
IF_3FFFFFFF_END:
li t1, 0x7FFFFFFF
bltu a0, t1, IF_7FFFFFFF # if (x <= 0x7FFFFFFF)
beq a0, t1, IF_7FFFFFFF
j IF_7FFFFFFF_END
IF_7FFFFFFF:
addi t0, t0, 1
IF_7FFFFFFF_END:
mv a0, t0 # return count
ret
```
### Test program(Original version)
#### In C
```clike=
#include<stdio.h>
#include<stdint.h>
static int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
int main(int argc, char const *argv[])
{
int test_data[20] = {
25448, 14227, 22674, 28864,
31649, 20975, 22181, 11350,
20409, 18526, 1399, 10944,
28693, 24509, 29763, 25829,
15952, 20041, 12062, 27150
};
int golden[20] = {
17, 18, 17, 17,
17, 17, 17, 18,
17, 17, 21, 18,
17, 17, 17, 17,
18, 17, 18, 17
};
bool pass = true;
for (int i = 0; i <= 19; i++)
{
if (my_clz(test_data[i]) != golden[i]){
pass = false;
}
}
if (pass){
printf("All test data passed!");
}
else{
printf("Something wrong!");
}
return 0;
}
```
#### In Assembly
```asm=
.data
test_num: .word 20
test_data: .word 25448, 14227, 22674, 28864, 31649, 20975, 22181, 11350, 20409, 18526, 1399, 10944, 28693, 24509, 29763, 25829, 15952, 20041, 12062, 27150
golden:.word 17, 18, 17, 17, 17, 17, 17, 18, 17, 17, 21, 18, 17, 17, 17, 17, 18, 17, 18, 17
pass_msg: .string "All test data passed!\n"
error_msg: .string "Something wrong!\n"
.text
j MAIN
my_clz:
#########################################################################
# argument:
# x <- a0
# variable:
# count <- t0
# i <- t1
# return value:
# count <- a0
#########################################################################
li t0, 0x0 # int count = 0
li t1, 0x1F # int i = 31
li t2, 0x1 # 1U
CLZ_LOOP:
sll t3, t2, t1 # t3 <- (1U << i)
and t3, a0, t3 # t3 <- x & (1U << i)
beq t3, x0, CLZ_IF_END
CLZ_IF:
mv a0, t0
ret
CLZ_IF_END:
addi t0, t0, 0x1 # count++
addi t1, t1, -0x1 # --i
bge t1, x0, CLZ_LOOP
mv a0, t0
ret
MAIN:
li s0, 19
la t0, test_data # int test_data[20] = {25448, 1422, ...
la t1, golden # int golden[20] = {17, 18, 17, 17, ...
lw t2, test_num
li t3, 0x1 # bool pass = true;
li t4, 0x0 # int i = 0
MAIN_FOR:
lw a0, 0(t0) # a0 = test_data[0]
# caller save
addi sp, sp, -24
sw ra, 0(sp)
sw t0, 4(sp)
sw t1, 8(sp)
sw t2, 12(sp)
sw t3, 16(sp)
sw t4, 20(sp)
jal ra, my_clz
# retrieve caller save
lw ra, 0(sp)
lw t0, 4(sp)
lw t1, 8(sp)
lw t2, 12(sp)
lw t3, 16(sp)
lw t4, 20(sp)
addi sp, sp, 24
mv t5, a0
lw t6, 0(t1) # t6 = golden[0]
beq t5, t6, END_IF_ERROR # if (my_clz(test_data[i]) != golden[i])
IF_ERROR:
li t3, 0x0 # pass = false;
END_IF_ERROR:
addi t4, t4, 0x1 # i++
addi t0, t0, 0x4 # point to test_data[i]
addi t1, t1, 0x4 # point to test_data[i]
blt t4, t2, MAIN_FOR
END_MAIN_FOR:
bne t3, x0, IF_PASS
ELSE_NOT_PASS:
# printf("Something wrong!");
addi a0, x0, 1
la a1, error_msg
addi a2, x0, 22
addi a7, x0, 64
ecall
li a7, 10
ecall
IF_PASS:
# printf("All test data passed!");
addi a0, x0, 1
la a1, pass_msg
addi a2, x0, 22
addi a7, x0, 64
ecall
END_IF_PASS:
# return
li a7, 10
ecall
```
Output:

### Test progam(No for loop version)
#### In C
```clike=
#include<stdio.h>
#include<stdint.h>
uint32_t no_for_loop_clz(uint32_t x){
uint32_t count = 0;
if(x == 0){
return 32;
}
if(x <= 0x0000FFFF){
count += 16;
x <<= 16;
}
if (x <= 0x00FFFFFF){
count += 8;
x <<= 8;
}
if (x <= 0x0FFFFFFF){
count += 4;
x <<= 4;
}
if (x <= 0x3FFFFFFF){
count += 2;
x <<= 2;
}
if (x <= 0x7FFFFFFF){
count ++;
}
return count;
}
int main(int argc, char const *argv[])
{
int test_data[20] = {
2, 14227, 22674, 28864,
31649, 20975, 22181, 11350,
20409, 18526, 1399, 10944,
28693, 24509, 29763, 25829,
15952, 20041, 12062, 27150
};
int golden[20] = {
30, 18, 17, 17,
17, 17, 17, 18,
17, 17, 21, 18,
17, 17, 17, 17,
18, 17, 18, 17
};
bool pass = true;
for (int i = 0; i <= 19; i++)
{
if (no_for_loop_clz(test_data[i]) != golden[i]){
pass = false;
}
}
if (pass){
printf("All test data passed!");
}
else{
printf("Something wrong!");
}
return 0;
}
```
#### In Assembly
```asm=
.data
test_num: .word 20
test_data: .word 2, 14227, 22674, 28864, 31649, 20975, 22181, 11350, 20409, 18526, 1399, 10944, 28693, 24509, 29763, 25829, 15952, 20041, 12062, 27150
golden:.word 30, 18, 17, 17, 17, 17, 17, 18, 17, 17, 21, 18, 17, 17, 17, 17, 18, 17, 18, 17
pass_msg: .string "All test data passed!\n"
error_msg: .string "Something wrong!\n"
.text
j MAIN
no_for_loop_clz:
#########################################################################
# argument:
# x <- a0
# variable:
# count <- t0
# return value:
# count <- a0
#########################################################################
li t0, 0 # count = 0;
beq a0, x0, IF_0 # if(x == 0)
j IF_0_END
IF_0:
li a0, 32
ret
IF_0_END:
li t1, 0x0000FFFF
bltu a0, t1, IF_0x0000FFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x0000FFFF
j IF_0x0000FFFF_END
IF_0x0000FFFF:
addi t0, t0, 16
slli a0, a0, 16
IF_0x0000FFFF_END:
li t1, 0x00FFFFFF
bltu a0, t1, IF_0x00FFFFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x00FFFFFF
j IF_0x00FFFFFF_END
IF_0x00FFFFFF:
addi t0, t0, 8
slli a0, a0, 8
IF_0x00FFFFFF_END:
li t1, 0x0FFFFFFF
bltu a0, t1, IF_0FFFFFFF # if (x <= 0x0FFFFFFF)
beq a0, t1, IF_0FFFFFFF
j IF_0FFFFFFF_END
IF_0FFFFFFF:
addi t0, t0, 4
slli a0, a0, 4
IF_0FFFFFFF_END:
li t1, 0x3FFFFFFF
bltu a0, t1, IF_3FFFFFFF # if (x <= 0x3FFFFFFF)
beq a0, t1, IF_3FFFFFFF
j IF_3FFFFFFF_END
IF_3FFFFFFF:
addi t0, t0, 2
slli a0, a0, 2
IF_3FFFFFFF_END:
li t1, 0x7FFFFFFF
bltu a0, t1, IF_7FFFFFFF # if (x <= 0x7FFFFFFF)
beq a0, t1, IF_7FFFFFFF
j IF_7FFFFFFF_END
IF_7FFFFFFF:
addi t0, t0, 1
IF_7FFFFFFF_END:
mv a0, t0 # return count
ret
MAIN:
li s0, 19
la t0, test_data # int test_data[20] = {25448, 1422, ...
la t1, golden # int golden[20] = {17, 18, 17, 17, ...
lw t2, test_num
li t3, 0x1 # bool pass = true;
li t4, 0x0 # int i = 0
MAIN_FOR:
lw a0, 0(t0) # a0 = test_data[0]
# caller save
addi sp, sp, -24
sw ra, 0(sp)
sw t0, 4(sp)
sw t1, 8(sp)
sw t2, 12(sp)
sw t3, 16(sp)
sw t4, 20(sp)
jal ra, no_for_loop_clz
# retrieve caller save
lw ra, 0(sp)
lw t0, 4(sp)
lw t1, 8(sp)
lw t2, 12(sp)
lw t3, 16(sp)
lw t4, 20(sp)
addi sp, sp, 24
mv t5, a0
lw t6, 0(t1) # t6 = golden[0]
beq t5, t6, END_IF_ERROR # if (my_clz(test_data[i]) != golden[i])
IF_ERROR:
li t3, 0x0 # pass = false;
END_IF_ERROR:
addi t4, t4, 0x1 # i++
addi t0, t0, 0x4 # point to test_data[i]
addi t1, t1, 0x4 # point to test_data[i]
blt t4, t2, MAIN_FOR
END_MAIN_FOR:
bne t3, x0, IF_PASS
ELSE_NOT_PASS:
# printf("Something wrong!");
addi a0, x0, 1
la a1, error_msg
addi a2, x0, 22
addi a7, x0, 64
ecall
li a7, 10
ecall
IF_PASS:
# printf("All test data passed!");
addi a0, x0, 1
la a1, pass_msg
addi a2, x0, 22
addi a7, x0, 64
ecall
END_IF_PASS:
# return
li a7, 10
ecall
```
Output:

### Comparison
||Original version|No for loop version|
|-|-|-|
|Cycles|4319|1495|
|Instrs. retired|2733|1065|
|CPI|1.58|1.4|
|IPC|0.633|0.712|
|Clock rate|457.75 Hz|534.50 Hz|
By comparing these results, the no-loop version is better than the original version.
## binary GCD algorithm
Greatest common divisor (G.C.D) algorithm is a efficient way to find greatest common factor. It mod each other until equal to zero.
ex.
1. Find GCD of 12 and 30
- 30%12 = `6`
- 12%6 = `0`
- `6` is GCD of 12 and 30
2. Find GCD of 123 and 36
- 123%36 = 15
- 36%15 = 6
- 15%6 = `3`
- 6%3 = `0`
- `3` is GCD of 123 and 36
Josef Stein introduced the binary GCD algorithm, which utilizes bitwise operations and avoids the `%` operation. This approach simplifies the computation of the greatest common divisor (GCD) for computers, making it more efficient and faster.
### C code
This is a binary GCD algorithm with `no_for_loop_clz(x)`
```clike=
#include <stdio.h>
#include <stdint.h>
uint32_t reverse_bits(uint32_t x){
x = (x >> 1) & 0x55555555 | (x & 0x55555555) << 1;
x = (x >> 2) & 0x33333333 | (x & 0x33333333) << 2;
x = (x >> 4) & 0x0F0F0F0F | (x & 0x0F0F0F0F) << 4;
x = (x >> 8) & 0x00FF00FF | (x & 0x00FF00FF) << 8;
x = (x >> 16) & 0x0000FFFF | (x & 0x0000FFFF) << 16;
return x;
}
uint32_t no_for_loop_clz(uint32_t x){
uint32_t count = 0;
if(x == 0){
return 32;
}
if(x <= 0x0000FFFF){
count += 16;
x <<= 16;
}
if (x <= 0x00FFFFFF){
count += 8;
x <<= 8;
}
if (x <= 0x0FFFFFFF){
count += 4;
x <<= 4;
}
if (x <= 0x3FFFFFFF){
count += 2;
x <<= 2;
}
if (x <= 0x7FFFFFFF){
count ++;
}
return count;
}
uint32_t trailing_zeros(uint32_t x) {
// reverse bits in x
x = reverse_bits(x);
// count leadnig zeros
uint32_t count = no_for_loop_clz(x);
return count;
}
void swap(uint32_t *a, uint32_t *b) {
uint32_t temp = *a;
*a = *b;
*b = temp;
}
uint32_t gcd(uint32_t u, uint32_t v) {
// Base cases: gcd(n, 0) = gcd(0, n) = n
if (u == 0) return v;
if (v == 0) return u;
// Using identities 2 and 3:
// gcd(2^i * u, 2^j * v) = 2^k * gcd(u, v) with u, v odd and k = min(i, j)
// 2^k is the greatest power of two that divides both 2^i * u and 2^j * v
int i = trailing_zeros(u);
u >>= i;
int j = trailing_zeros(v);
v >>= j;
int k = (i < j) ? i : j;
while (true) {
// Swap if necessary so u <= v
if (u > v) {
swap(&u, &v);
}
// Identity 4: gcd(u, v) = gcd(u, v - u) as u <= v and u, v are both odd
v -= u;
// v becomes even
if (v == 0) {
// Identity 1: gcd(u, 0) = u
// The shift by k is necessary to add back the 2^k factor that was removed before the loop
return u << k;
}
// Identity 3: gcd(u, 2^j * v) = gcd(u, v) as u is odd
v >>= trailing_zeros(v);
}
}
```
There is a Rust version in [wiki](https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
### Assembly code
```asm=
no_for_loop_clz:
#########################################################################
# argument:
# x <- a0
# variable:
# count <- t0
# return value:
# count <- a0
#########################################################################
li t0, 0 # count = 0;
beq a0, x0, IF_0 # if(x == 0)
j IF_0_END
IF_0:
li a0, 32
ret
IF_0_END:
li t1, 0x0000FFFF
bltu a0, t1, IF_0x0000FFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x0000FFFF
j IF_0x0000FFFF_END
IF_0x0000FFFF:
addi t0, t0, 16
slli a0, a0, 16
IF_0x0000FFFF_END:
li t1, 0x00FFFFFF
bltu a0, t1, IF_0x00FFFFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x00FFFFFF
j IF_0x00FFFFFF_END
IF_0x00FFFFFF:
addi t0, t0, 8
slli a0, a0, 8
IF_0x00FFFFFF_END:
li t1, 0x0FFFFFFF
bltu a0, t1, IF_0FFFFFFF # if (x <= 0x0FFFFFFF)
beq a0, t1, IF_0FFFFFFF
j IF_0FFFFFFF_END
IF_0FFFFFFF:
addi t0, t0, 4
slli a0, a0, 4
IF_0FFFFFFF_END:
li t1, 0x3FFFFFFF
bltu a0, t1, IF_3FFFFFFF # if (x <= 0x3FFFFFFF)
beq a0, t1, IF_3FFFFFFF
j IF_3FFFFFFF_END
IF_3FFFFFFF:
addi t0, t0, 2
slli a0, a0, 2
IF_3FFFFFFF_END:
li t1, 0x7FFFFFFF
bltu a0, t1, IF_7FFFFFFF # if (x <= 0x7FFFFFFF)
beq a0, t1, IF_7FFFFFFF
j IF_7FFFFFFF_END
IF_7FFFFFFF:
addi t0, t0, 1
IF_7FFFFFFF_END:
mv a0, t0 # return count
ret
reverse_bits:
#########################################################################
# argument:
# n <- a0
# variable:
#
# return value:
# n < - a0
#########################################################################
li t2, 0x55555555
srli t0, a0, 0x1
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x1
or a0, t1, t0
li t2, 0x33333333
srli t0, a0, 0x2
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x2
or a0, t1, t0
li t2, 0x0F0F0F0F
srli t0, a0, 0x4
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x4
or a0, t1, t0
li t2, 0x00FF00FF
srli t0, a0, 0x8
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x8
or a0, t1, t0
li t2, 0x0000FFFF
srli t0, a0, 0x10
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x10
or a0, t1, t0
ret
swap:
#########################################################################
# argument:
# a <- a0
# b <- a1
# variable:
#
# return value:
# void
#########################################################################
mv t0, a0
mv a0, a1
mv a1, t0
ret
trailing_zeros:
#########################################################################
# argument:
# x <- a0
# variable:
#
# return value:
# count <- a0
#########################################################################
# caller save
addi sp, sp, -4
sw ra, 0(sp)
jal ra, reverse_bits
lw ra, 0(sp)
addi sp, sp, 4
# caller save
addi sp, sp, -4
sw ra, 0(sp)
jal ra, no_for_loop_clz
lw ra, 0(sp)
addi sp, sp, 4
ret
gcd:
#########################################################################
# argument:
# u <- a0
# v <- a1
# variable:
# i <- t0
# j <- t1
# k <- t2
# return value:
# G.C.D. <- a0
#########################################################################
bne a0, x0, END_IF_U_EQ_ZERO
IF_U_EQ_ZERO:
mv a0, a1 # return v
ret
END_IF_U_EQ_ZERO:
bne a1, x0, END_IF_V_EQ_ZERO
IF_V_EQ_ZERO:
ret # return u
END_IF_V_EQ_ZERO:
# caller save
addi sp, sp, -12
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
jal ra, trailing_zeros
mv t0, a0 # int i = trailing_zeros(u)
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
addi sp, sp, 12
srl a0, a0, t0 # u >>= i;
# caller save
addi sp, sp, -16
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
sw t0, 12(sp)
mv a0, a1 # trailing_zeros(v)
jal ra, trailing_zeros
mv t1, a0 # int j = trailing_zeros(v)
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
lw t0, 12(sp)
addi sp, sp, 16
srl a1, a1, t1 # v >>= j;
blt t0, t1, IF_I_LESS_THAN_J # int k = (i < j) ? i : j;
j ELSE_I_LESS_THAN_J
IF_I_LESS_THAN_J:
mv t2, t0
j END_I_LESS_THAN_J
ELSE_I_LESS_THAN_J:
mv t2, t1
END_I_LESS_THAN_J:
WHILE_TRUE:
bgt a0, a1, IF_U_BIGGER_THAN # if (u > v)
j END_IF_U_BIGGER_THAN
IF_U_BIGGER_THAN:
# caller save
addi sp, sp, -16
sw ra, 0(sp)
sw t0, 4(sp)
sw t1, 8(sp)
sw t2, 12(sp)
jal ra, swap # swap(&u, &v);
lw ra, 0(sp)
lw t0, 4(sp)
lw t1, 8(sp)
lw t2, 12(sp)
addi sp, sp, 16
END_IF_U_BIGGER_THAN:
sub a1, a1, a0 # v -= u;
beq a1, x0, IF_V_EQ_ZERO_IN_LOOP
j END_IF_V_EQ_ZERO_IN_LOOP
IF_V_EQ_ZERO_IN_LOOP:
sll a0, a0, t2 # return u << k;
ret
END_IF_V_EQ_ZERO_IN_LOOP:
# caller save
addi sp, sp, -24
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
sw t0, 12(sp)
sw t1, 16(sp)
sw t2, 20(sp)
mv a0, a1 # trailing_zeros(v);
jal ra, trailing_zeros
mv t3, a0
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
lw t0, 12(sp)
lw t1, 16(sp)
lw t2, 20(sp)
addi sp, sp, 24
srl a1, a1, t3 # v >>= trailing_zeros(v);
j WHILE_TRUE
END_WHILE_TRUE:
```
To avoid redundant function calls, we can merge `no_for_loop_clz()`, `reverse_bits()`, and `trailing_zeros()` into a single `trailing_zeros()` function. Also, instead of calling `swap()`, we can directly swap registers `a0` and `a1`.
### C code
```clike=
#include <stdio.h>
#include <stdint.h>
uint32_t trailing_zeros(uint32_t x) {
// reverse bits in x
x = (x >> 1) & 0x55555555 | (x & 0x55555555) << 1;
x = (x >> 2) & 0x33333333 | (x & 0x33333333) << 2;
x = (x >> 4) & 0x0F0F0F0F | (x & 0x0F0F0F0F) << 4;
x = (x >> 8) & 0x00FF00FF | (x & 0x00FF00FF) << 8;
x = (x >> 16) & 0x0000FFFF | (x & 0x0000FFFF) << 16;
// count leadnig zeros
uint32_t count = 0;
if(x == 0){
return 32;
}
if(x <= 0x0000FFFF){
count += 16;
x <<= 16;
}
if (x <= 0x00FFFFFF){
count += 8;
x <<= 8;
}
if (x <= 0x0FFFFFFF){
count += 4;
x <<= 4;
}
if (x <= 0x3FFFFFFF){
count += 2;
x <<= 2;
}
if (x <= 0x7FFFFFFF){
count ++;
}
return count;
}
uint32_t gcd(uint32_t u, uint32_t v) {
// Base cases: gcd(n, 0) = gcd(0, n) = n
if (u == 0) return v;
if (v == 0) return u;
// Using identities 2 and 3:
// gcd(2^i * u, 2^j * v) = 2^k * gcd(u, v) with u, v odd and k = min(i, j)
// 2^k is the greatest power of two that divides both 2^i * u and 2^j * v
int i = trailing_zeros(u);
u >>= i;
int j = trailing_zeros(v);
v >>= j;
int k = (i < j) ? i : j;
while (1) {
// Swap if necessary so u <= v
if (u > v) {
int temp = u;
u = v;
v = temp;
}
// Identity 4: gcd(u, v) = gcd(u, v - u) as u <= v and u, v are both odd
v -= u;
// v is now even
if (v == 0) {
// Identity 1: gcd(u, 0) = u
// The shift by k is necessary to add back the 2^k factor that was removed before the loop
return u << k;
}
// Identity 3: gcd(u, 2^j * v) = gcd(u, v) as u is odd
v >>= trailing_zeros(v);
}
}
```
### Assembly code
```asm=
trailing_zeros:
#########################################################################
# argument:
# x <- a0
# variable:
#
# return value:
# count <- a0
#########################################################################
# reverse bits
li t2, 0x55555555
srli t0, a0, 0x1
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x1
or a0, t1, t0
li t2, 0x33333333
srli t0, a0, 0x2
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x2
or a0, t1, t0
li t2, 0x0F0F0F0F
srli t0, a0, 0x4
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x4
or a0, t1, t0
li t2, 0x00FF00FF
srli t0, a0, 0x8
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x8
or a0, t1, t0
li t2, 0x0000FFFF
srli t0, a0, 0x10
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x10
or a0, t1, t0
# count leading zeros
li t0, 0 # count = 0;
beq a0, x0, IF_0 # if(x == 0)
j IF_0_END
IF_0:
li a0, 32
ret
IF_0_END:
li t1, 0x0000FFFF
bltu a0, t1, IF_0x0000FFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x0000FFFF
j IF_0x0000FFFF_END
IF_0x0000FFFF:
addi t0, t0, 16
slli a0, a0, 16
IF_0x0000FFFF_END:
li t1, 0x00FFFFFF
bltu a0, t1, IF_0x00FFFFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x00FFFFFF
j IF_0x00FFFFFF_END
IF_0x00FFFFFF:
addi t0, t0, 8
slli a0, a0, 8
IF_0x00FFFFFF_END:
li t1, 0x0FFFFFFF
bltu a0, t1, IF_0FFFFFFF # if (x <= 0x0FFFFFFF)
beq a0, t1, IF_0FFFFFFF
j IF_0FFFFFFF_END
IF_0FFFFFFF:
addi t0, t0, 4
slli a0, a0, 4
IF_0FFFFFFF_END:
li t1, 0x3FFFFFFF
bltu a0, t1, IF_3FFFFFFF # if (x <= 0x3FFFFFFF)
beq a0, t1, IF_3FFFFFFF
j IF_3FFFFFFF_END
IF_3FFFFFFF:
addi t0, t0, 2
slli a0, a0, 2
IF_3FFFFFFF_END:
li t1, 0x7FFFFFFF
bltu a0, t1, IF_7FFFFFFF # if (x <= 0x7FFFFFFF)
beq a0, t1, IF_7FFFFFFF
j IF_7FFFFFFF_END
IF_7FFFFFFF:
addi t0, t0, 1
IF_7FFFFFFF_END:
mv a0, t0 # return count
ret
gcd:
#########################################################################
# argument:
# u <- a0
# v <- a1
# variable:
# i <- t0
# j <- t1
# k <- t2
# return value:
# G.C.D. <- a0
#########################################################################
bne a0, x0, END_IF_U_EQ_ZERO
IF_U_EQ_ZERO:
mv a0, a1 # return v
ret
END_IF_U_EQ_ZERO:
bne a1, x0, END_IF_V_EQ_ZERO
IF_V_EQ_ZERO:
ret # return u
END_IF_V_EQ_ZERO:
# caller save
addi sp, sp, -12
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
jal ra, trailing_zeros
mv t0, a0 # int i = trailing_zeros(u)
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
addi sp, sp, 12
srl a0, a0, t0 # u >>= i;
# caller save
addi sp, sp, -16
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
sw t0, 12(sp)
mv a0, a1 # trailing_zeros(v)
jal ra, trailing_zeros
mv t1, a0 # int j = trailing_zeros(v)
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
lw t0, 12(sp)
addi sp, sp, 16
srl a1, a1, t1 # v >>= j;
blt t0, t1, IF_I_LESS_THAN_J # int k = (i < j) ? i : j;
j ELSE_I_LESS_THAN_J
IF_I_LESS_THAN_J:
mv t2, t0
j END_I_LESS_THAN_J
ELSE_I_LESS_THAN_J:
mv t2, t1
END_I_LESS_THAN_J:
WHILE_TRUE:
bgt a0, a1, IF_U_BIGGER_THAN # if (u > v)
j END_IF_U_BIGGER_THAN
IF_U_BIGGER_THAN:
mv t3, a0
mv a0, a1
mv a1, t3
END_IF_U_BIGGER_THAN:
sub a1, a1, a0 # v -= u;
beq a1, x0, IF_V_EQ_ZERO_IN_LOOP
j END_IF_V_EQ_ZERO_IN_LOOP
IF_V_EQ_ZERO_IN_LOOP:
sll a0, a0, t2 # return u << k;
ret
END_IF_V_EQ_ZERO_IN_LOOP:
# caller save
addi sp, sp, -24
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
sw t0, 12(sp)
sw t1, 16(sp)
sw t2, 20(sp)
mv a0, a1 # trailing_zeros(v);
jal ra, trailing_zeros
mv t3, a0 # t3 = trailing_zeros(v);
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
lw t0, 12(sp)
lw t1, 16(sp)
lw t2, 20(sp)
addi sp, sp, 24
srl a1, a1, t3 # v >>= register t3;
j WHILE_TRUE
END_WHILE_TRUE:
```
### Test
To test this code, I wrote a main() function.
```clike=
int main(int argc, char const *argv[])
{
int test_data1[20] = {
323, 929, 728, 175,
795, 464, 117, 994,
537, 629, 744, 254,
359, 554, 624, 231,
309, 198, 734, 531
};
int test_data2[20] = {
384, 575, 450, 456,
985, 5, 572, 314,
411, 481, 393, 630,
352, 742, 17, 279,
130, 893, 770, 486
};
int golden[20] = {
1, 1, 2, 1,
5, 1, 13, 2,
3, 37, 3, 2,
1, 2, 1, 3,
1, 1, 2, 9
};
bool pass = true;
for (int i = 0; i < 19; i++)
{
if (gcd(test_data1[i], test_data2[i]) != golden[i]){
pass = false;
}
}
if (pass){
printf("All test data passed!");
}
else{
printf("Some thing wrong!");
}
return 0;
}
```
Combine this code with the `gcd()` function into a single RISC-V implementation.
```asm=
.data
test_data1: .word 323, 929, 728, 175, 795, 464, 117, 994, 537, 629, 744, 254, 359, 554, 624, 231, 309, 198, 734, 531
test_data2: .word 384, 575, 450, 456, 985, 5, 572, 314, 411, 481, 393, 630, 352, 742, 17, 279, 130, 893, 770, 486
golden: .word 1, 1, 2, 1, 5, 1, 13, 2, 3, 37, 3, 2, 1, 2, 1, 3, 1, 1, 2, 9
test_num: .word 20
pass_msg: .string "All test data passed!\n"
error_msg: .string "Something wrong!\n"
.text
j MAIN
trailing_zeros:
#########################################################################
# argument:
# x <- a0
# variable:
#
# return value:
# count <- a0
#########################################################################
# reverse bits
li t2, 0x55555555
srli t0, a0, 0x1
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x1
or a0, t1, t0
li t2, 0x33333333
srli t0, a0, 0x2
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x2
or a0, t1, t0
li t2, 0x0F0F0F0F
srli t0, a0, 0x4
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x4
or a0, t1, t0
li t2, 0x00FF00FF
srli t0, a0, 0x8
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x8
or a0, t1, t0
li t2, 0x0000FFFF
srli t0, a0, 0x10
and t0, t0, t2
and t1, a0, t2
slli t1, t1, 0x10
or a0, t1, t0
# count leading zeros
li t0, 0 # count = 0;
beq a0, x0, IF_0 # if(x == 0)
j IF_0_END
IF_0:
li a0, 32
ret
IF_0_END:
li t1, 0x0000FFFF
bltu a0, t1, IF_0x0000FFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x0000FFFF
j IF_0x0000FFFF_END
IF_0x0000FFFF:
addi t0, t0, 16
slli a0, a0, 16
IF_0x0000FFFF_END:
li t1, 0x00FFFFFF
bltu a0, t1, IF_0x00FFFFFF # if (x <= 0x00FFFFFF)
beq a0, t1, IF_0x00FFFFFF
j IF_0x00FFFFFF_END
IF_0x00FFFFFF:
addi t0, t0, 8
slli a0, a0, 8
IF_0x00FFFFFF_END:
li t1, 0x0FFFFFFF
bltu a0, t1, IF_0FFFFFFF # if (x <= 0x0FFFFFFF)
beq a0, t1, IF_0FFFFFFF
j IF_0FFFFFFF_END
IF_0FFFFFFF:
addi t0, t0, 4
slli a0, a0, 4
IF_0FFFFFFF_END:
li t1, 0x3FFFFFFF
bltu a0, t1, IF_3FFFFFFF # if (x <= 0x3FFFFFFF)
beq a0, t1, IF_3FFFFFFF
j IF_3FFFFFFF_END
IF_3FFFFFFF:
addi t0, t0, 2
slli a0, a0, 2
IF_3FFFFFFF_END:
li t1, 0x7FFFFFFF
bltu a0, t1, IF_7FFFFFFF # if (x <= 0x7FFFFFFF)
beq a0, t1, IF_7FFFFFFF
j IF_7FFFFFFF_END
IF_7FFFFFFF:
addi t0, t0, 1
IF_7FFFFFFF_END:
mv a0, t0 # return count
ret
gcd:
#########################################################################
# argument:
# u <- a0
# v <- a1
# variable:
# i <- t0
# j <- t1
# k <- t2
# return value:
# G.C.D. <- a0
#########################################################################
bne a0, x0, END_IF_U_EQ_ZERO
IF_U_EQ_ZERO:
mv a0, a1 # return v
ret
END_IF_U_EQ_ZERO:
bne a1, x0, END_IF_V_EQ_ZERO
IF_V_EQ_ZERO:
ret # return u
END_IF_V_EQ_ZERO:
# caller save
addi sp, sp, -12
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
jal ra, trailing_zeros
mv t0, a0 # int i = trailing_zeros(u)
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
addi sp, sp, 12
srl a0, a0, t0 # u >>= i;
# caller save
addi sp, sp, -16
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
sw t0, 12(sp)
mv a0, a1 # trailing_zeros(v)
jal ra, trailing_zeros
mv t1, a0 # int j = trailing_zeros(v)
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
lw t0, 12(sp)
addi sp, sp, 16
srl a1, a1, t1 # v >>= j;
blt t0, t1, IF_I_LESS_THAN_J # int k = (i < j) ? i : j;
j ELSE_I_LESS_THAN_J
IF_I_LESS_THAN_J:
mv t2, t0
j END_I_LESS_THAN_J
ELSE_I_LESS_THAN_J:
mv t2, t1
END_I_LESS_THAN_J:
WHILE_TRUE:
bgt a0, a1, IF_U_BIGGER_THAN # if (u > v)
j END_IF_U_BIGGER_THAN
IF_U_BIGGER_THAN:
mv t3, a0
mv a0, a1
mv a1, t3
END_IF_U_BIGGER_THAN:
sub a1, a1, a0 # v -= u;
beq a1, x0, IF_V_EQ_ZERO_IN_LOOP
j END_IF_V_EQ_ZERO_IN_LOOP
IF_V_EQ_ZERO_IN_LOOP:
sll a0, a0, t2 # return u << k;
ret
END_IF_V_EQ_ZERO_IN_LOOP:
# caller save
addi sp, sp, -24
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp)
sw t0, 12(sp)
sw t1, 16(sp)
sw t2, 20(sp)
mv a0, a1 # trailing_zeros(v);
jal ra, trailing_zeros
mv t3, a0 # t3 = trailing_zeros(v);
lw ra, 0(sp)
lw a0, 4(sp)
lw a1, 8(sp)
lw t0, 12(sp)
lw t1, 16(sp)
lw t2, 20(sp)
addi sp, sp, 24
srl a1, a1, t3 # v >>= register t3;
j WHILE_TRUE
END_WHILE_TRUE:
MAIN:
la t0, test_data1 # t0 -> test_data1[0]
la t1, test_data2 # t1 -> test_data2[0]
la t2, golden # t2 -> golden[0]
li t3, 0x1 # bool pass = true;
lw t4, test_num # t4 = 20
li t5, 0x0 # int i = 0
MAIN_FOR:
lw t6, 0(t0) # test_data1[t0]
mv a0, t6
lw t6, 0(t1) # test_data2[t0]
mv a1, t6
# caller save
addi sp, sp, -28
sw ra, 0(sp)
sw t0, 4(sp)
sw t1, 8(sp)
sw t2, 12(sp)
sw t3, 16(sp)
sw t4, 20(sp)
sw t5, 24(sp)
jal ra, gcd
lw ra, 0(sp)
lw t0, 4(sp)
lw t1, 8(sp)
lw t2, 12(sp)
lw t3, 16(sp)
lw t4, 20(sp)
lw t5, 24(sp)
addi sp, sp, 28
lw t6, 0(t2)
bne a0, t6, IF_ERROR
j END_IF_ERROR
IF_ERROR:
li t3, 0x0
END_IF_ERROR:
addi t0, t0, 4
addi t1, t1, 4
addi t2, t2, 4
addi t5, t5, 0x1 # i++
blt t5, t4, MAIN_FOR # i <= 19
END_MAIN_FOR:
beq t3, x0, ELSE_IF_PASS
IF_PASS:
# printf("All test data passed!");
addi a0, x0, 1
la a1, pass_msg
addi a2, x0, 22
addi a7, x0, 64
ecall
j END_IF_PASS
ELSE_IF_PASS:
# printf("Something wrong!");
addi a0, x0, 1
la a1, error_msg
addi a2, x0, 22
addi a7, x0, 64
ecall
END_IF_PASS:
# return
li a7, 10
ecall
```
Output:

||Before function merge|After function merge|
|-|-|-|
|Cycles|14291|11651|
|Instrs. retired|10901|9221|
|CPI|1.31|1.26|
|IPC|0.763|0.791|
|Clock rate|20.33 KHz|10.81 KHz|
After function merge, the performance become better.
## Pipeline stage analysis

Take `and x6 x10 x7` in `reverse_bits` for example.
```=
10c: 00757333 and x6 x10 x7
```
### IF

The `PC` register points to the instruction memory at `0x000001C`. The instruction memory outputs the instruction at `0x000001C`, which is machine code translated from `and x6, x10, x7`. This instruction and the corresponding data are then placed into the IFID pipeline register.
### ID

The decode unit will take the instruction and output the indices for `rs1`, `rs2`, and `rd`. In this example, `rs1`, `rs2`, and `rd` are $10_{10}=0x0a_{16}$,$7_{10}=0x07_{16}$, and$6_{10}=0x06_{16}$, respectively. The register unit will use these indices to output their values. The value of `rd` will be taken by the IDEX pipeline register in preparation for write-back.
### EXE

Because the type of this instruction is R-type, the mux will select the values in `rs1` and `rs2` to send to the ALU. The ALU will then output the bitwise `and` operation result of `0x000080c2` and `0x00ff00ff`.
### MEM

Because this instruction will not write data into the data memory, `Wr_en` is set to false. However, the data memory still takes `Addr` and outputs its value.
### WB

In this stage, the mux will select the correct value and place it into the registers unit. At the same time, `0x06` also becomes the `Wr idx` in the registers unit, writing `0x000000c2` to the `x6` register.
### Hazard
- Structure hazard
A limitation of the structure is evident in load instructions. For example, a load instruction attempts to fetch the instruction and memory data simultaneously, but the memory design in RISC-V can't handle this operation. To solve this problem, RISC-V divides memory into instruction memory and data memory. This way, each memory block handles only one address and outputs one piece of data at a time.
- Control hazard
Uncertainty in instruction execution arises with branch instructions. In a RISC-V CPU, it is not certain whether instructions following a branch instruction will be executed until the EX stage. To address this, RISC-V implements a flush mechanism. If a branch instruction necessitates branching, it will flush data in the IF and ID stages and fetch new instructions again.
Take `2ec: e89ff0ef jal x1 -376 <gcd>` in main for example
```=
2ec: e89ff0ef jal x1 -376 <gcd>
2f0: 00012083 lw x1 0 x2
2f4: 00412283 lw x5 4 x2
2f8: 00812303 lw x6 8 x2
2fc: 00c12383 lw x7 12 x2
300: 01012e03 lw x28 16 x2
```




After the EXE stage, `2f0: 00012083 lw x1 0 x2` and `2f4: 00412283 lw x5 4 x2` are flushed and replaced with `nop`
- Data hazard
Can't reach newest data.
Take `b4: 0072f2b3 and x5 x5 x7` in `reverse_bits` for example.
```=
b0: 00155293 srli x5 x10 1
b4: 0072f2b3 and x5 x5 x7
b8: 00757333 and x6 x10 x7
bc: 00131313 slli x6 x6 1
c0: 00536533 or x10 x6 x5
```
Correct value of `x5` will be caculated after EXE stage of `b0: 00155293 srli x5 x10 1`.
But when `and x5 x5 x7` in EXE stage, correct value of `x5` still in MEM stage.
To solve this problem, RISC-V implement forwarding to make sure EXE stage always get newest data.

If source of current instruction get by MEM stage of previous instruction,
RISC-V will insert a `nop(stall)` to delay pipeline and make sure current instruction will get correct value.
Take `2c0: 000f8513 addi x10 x31 0` in MAIN for example.
```=
2bc: 0002af83 lw x31 0 x5
2c0: 000f8513 addi x10 x31 0
2c4: 00032f83 lw x31 0 x6
2c8: 000f8593 addi x11 x31 0
```



But If insert a no-relative instruction between load instruction and use instruction , can avoid `stall`.
For example, for loop in `MAIN:`
```asm=
MAIN_FOR:
lw t6, 0(t0) # test_data1[t0]
mv a0, t6
lw t6, 0(t1) # test_data2[t1]
mv a1, t6
# caller save
addi sp, sp, -28
sw ra, 0(sp)
sw t0, 4(sp)
sw t1, 8(sp)
sw t2, 12(sp)
sw t3, 16(sp)
sw t4, 20(sp)
sw t5, 24(sp)
jal ra, gcd
# reterive caller save
lw ra, 0(sp)
lw t0, 4(sp)
lw t1, 8(sp)
lw t2, 12(sp)
lw t3, 16(sp)
lw t4, 20(sp)
lw t5, 24(sp)
addi sp, sp, 28
lw t6, 0(t2) # golden[t2]
bne a0, t6, IF_ERROR # if (gcd(test_data1[i], test_data2[i]) != golden[i])
j END_IF_ERROR
IF_ERROR:
li t3, 0x0
addi t0, t0, 4 # t0 point to test_data1[t0+1]
addi t1, t1, 4 # t1 point to test_data2[t1+1]
addi t2, t2, 4 # t2 point to golden[t2+1]
addi t5, t5, 0x1 # i++
blt t5, t4, MAIN_FOR # i <= 19
END_MAIN_FOR:
```
It load memory value to `t6` and move it to `a` resisters. At bottom of the for loop
```asm=
addi t0, t0, 4 # t0 point to test_data1[t0+1]
addi t1, t1, 4 # t1 point to test_data2[t1+1]
addi t2, t2, 4 # t2 point to golden[t2+1]
```
It can be moved between load and move instruction
```asm=
MAIN_FOR:
lw t6, 0(t0) # test_data1[t0]
addi t0, t0, 4 # t0 point to test_data1[t0+1]
mv a0, t6
lw t6, 0(t1) # test_data2[t1]
addi t1, t1, 4 # t1 point to test_data2[t1+1]
mv a1, t6
# caller save
addi sp, sp, -28
sw ra, 0(sp)
sw t0, 4(sp)
sw t1, 8(sp)
sw t2, 12(sp)
sw t3, 16(sp)
sw t4, 20(sp)
sw t5, 24(sp)
jal ra, gcd
# reterive caller save
lw ra, 0(sp)
lw t0, 4(sp)
lw t1, 8(sp)
lw t2, 12(sp)
lw t3, 16(sp)
lw t4, 20(sp)
lw t5, 24(sp)
addi sp, sp, 28
lw t6, 0(t2) # golden[t2]
addi t2, t2, 4 # t2 point to golden[t2+1]
bne a0, t6, IF_ERROR # if (gcd(test_data1[i], test_data2[i]) != golden[i])
j END_IF_ERROR
IF_ERROR:
li t3, 0x0
addi t5, t5, 0x1 # i++
blt t5, t4, MAIN_FOR # i <= 19
END_MAIN_FOR:
```





Because of `addi x5, x5, 4`, RISC-V CPU don't need to insert more `stall` to delay `addi, x10 x31, 0`.
||Before insert no relative instr.|After insert no relative instr.|
|-|-|-|
|Cycles|16020|15960|
|Instrs. retired|12760|12760|
|CPI|1.26|1.25|
|IPC|0.797|0.799|
|Clock rate|8.57 KHz|5.96 KHz|
Number of required cycles decrease.