# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < [`Hotmercury`](https://github.com/Hotmercury) >
The RISC-V instruction set [reference](https://riscv.org//wp-content/uploads/2017/05/riscv-spec-v2.2.pdf) is available here.
[code on github](https://github.com/HotMercury/riscv-multiplicator-no-mul-instruction)
Find the lowest 0 bit (the one closest to the LSB), then clear every bit to its left, keeping only the run of trailing 1s.
We can use this mask to compute the carry bit of an increment.
:::success
example
mask = 1010 0111
1010 0111 -> 0000 0111
then ((mask << 1) | 1) xor mask gives the carry bit
-> 0000 1000
:::
```c
uint64_t mask_lowest_zero(uint64_t x)
{
uint64_t mask = x;
mask &= (mask << 1) | 0x1;
mask &= (mask << 2) | 0x3;
mask &= (mask << 4) | 0xF;
mask &= (mask << 8) | 0xFF;
mask &= (mask << 16) | 0xFFFF;
mask &= (mask << 32) | 0xFFFFFFFF;
return mask;
}
```
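As a quick sanity check, a minimal sketch (hypothetical test values, assuming the `mask_lowest_zero` definition above is in the same file) that reproduces the worked example:
```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t x = 0xA7;                          /* 1010 0111 */
    uint64_t mask = mask_lowest_zero(x);        /* expect 0000 0111 */
    uint64_t carry = ((mask << 1) | 1) ^ mask;  /* expect 0000 1000 */
    assert(mask == 0x07 && carry == 0x08);
    printf("mask = 0x%llx, carry = 0x%llx\n",
           (unsigned long long) mask, (unsigned long long) carry);
    return 0;
}
```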
Note that the `ori` immediate is only 12 bits (sign-extended), so any mask constant of $2^{11}$ or larger, such as `0xFFFF`, must first be loaded into a register with `li` (which the assembler expands to `lui` + `addi`).
```asm
.data
# dword -> 8 bytes
test_1: .dword 0x10FFFFFFFFFF3333
.text
main:
la t0, test_1 # pseudo-instruction: expands to lui/auipc + addi to form the address
lw a0, 0(t0)
lw a1, 4(t0)
jal mask_lowest_zero
li a7, 10 # exit syscall
ecall
mask_lowest_zero:
# a0 low , a1 high
# mask &= (mask << 1) | 1;
# a1,a0 = 64 bits parameter
slli t1, a1, 1
srli t0, a0, 31
or t1, t1, t0 # t1,t0
slli t0, a0, 1 # t0 = a0 << 1
ori t0, t0, 1 # x = x | 1
and a0, a0, t0
and a1, a1, t1
# mask &= (mask << 2) | 0x3;
slli t1, a1, 2
srli t0, a0, 30
or t1, t1, t0 # left 32 bits
slli t0, a0, 2 # t0 = a0 << 2
ori t0, t0, 3 # x = x | 3
and a0, a0, t0
and a1, a1, t1
# mask &= (mask << 4) | 0xF;
slli t1, a1, 4
srli t0, a0, 28
or t1, t1, t0 # left 32 bits
slli t0, a0, 4 # t0 = a0 << 4
ori t0, t0, 0xF # x = x | 0xF
and a0, a0, t0
and a1, a1, t1
# mask &= (mask << 8) | 0xFF;
slli t1, a1, 8
srli t0, a0, 24
or t1, t1, t0 # left 32 bits
slli t0, a0, 8 # t0 = a0 << 8
ori t0, t0, 0xFF # x = x | 0xFF
and a0, a0, t0
and a1, a1, t1
# mask &= (mask << 16) | 0xFFFF;
li t3 , 0xFFFF # lui + addi
slli t1, a1, 16
srli t0, a0, 16
or t1, t1, t0 # left 32 bits
slli t0, a0, 16 # t0 = a0 << 16
or t0, t0, t3 # x = x | 0xFFFF
and a0, a0, t0
and a1, a1, t1
# mask &= (mask << 32) | 0xFFFFFFFF;
and a1, a1, a0 # shifting left by 32 moves the low word into the high word; the low word is unchanged
ret
```
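The pattern above builds each 64-bit left shift out of two 32-bit registers. A minimal C sketch of the same decomposition (the function name is hypothetical; it assumes a shift amount 0 < n < 32):
```c
#include <stdint.h>

/* 64-bit left shift built from two 32-bit halves, mirroring the (a1, a0) pattern above */
static void shl64(uint32_t *hi, uint32_t *lo, int n) /* assumes 0 < n < 32 */
{
    *hi = (*hi << n) | (*lo >> (32 - n)); /* bits carried out of the low word */
    *lo <<= n;
}
```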
**inc: x = x + 1**
**flow**
1. check the all-ones case, which would overflow to 0
2. build the carry mask with `mask_lowest_zero`
3. set the carry bit and clear the trailing ones: return `(x & ~mask) | z1`
```clike
int64_t inc(int64_t x)
{
// x = all one bits will overflow to 0
if (~x == 0)
return 0;
// set carry flag
int64_t mask = mask_lowest_zero(x);
// 0011 -> 0100
int64_t z1 = mask ^ ((mask << 1) | 1);
return (x & ~mask) | z1;
}
```
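A quick hedged check of `inc()` against ordinary addition (hypothetical test values; assumes `inc` and `mask_lowest_zero` from above are in the same file):
```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    int64_t samples[] = {0, 1, 0x7, 0xFF, 0x12345678, -1};
    for (int i = 0; i < 6; i++)
        assert(inc(samples[i]) == samples[i] + 1); /* -1 wraps to 0, matching the early return */
    return 0;
}
```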
```asm
inc:
addi sp, sp, -12
sw ra, 0(sp)
sw a0, 4(sp)
sw a1, 8(sp) # save parameters
jal mask_lowest_zero
mv t0, a0
mv t1, a1 # t1,t0 mask
lw a0, 4(sp)
lw a1, 8(sp) # restore parameters
slli t3, t1, 1
srli t2, t0, 31
or t3, t3, t2
slli t2, t0, 1 # a1,a0 << 1
ori t2, t2, 1 # a1,a0 | 1
xor t2, t2, t0
xor t3, t3, t1 # t2,t3 z1
not t1, t1
not t0, t0 # ~mask
and a1, a1, t1
and a0, a0, t0 # a1,a0 & ~mask
or a1, a1, t3
or a0, a0, t2 # a1,a0 | z1
lw ra, 0(sp)
addi sp, sp, 12
ret
```
`getbit`: return the value of the nth bit.
```c
static inline int64_t getbit(int64_t value, int n)
{
return (value >> n) & 1;
}
```
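A couple of example calls (a tiny sketch, assuming `getbit` above is in the same file):
```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    assert(getbit(0xA, 1) == 1);               /* 0b1010: bit 1 is set   */
    assert(getbit(0xA, 2) == 0);               /* 0b1010: bit 2 is clear */
    assert(getbit(INT64_C(1) << 40, 40) == 1); /* indices past 31 work because the value is 64-bit */
    return 0;
}
```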
```asm
getbit:
addi sp, sp, -8
sw ra, 0(sp)
sw s0, 4(sp)
li s0, 32
bge a2, s0, getbit_l # if (n >= 32) the bit lives in the high word
srl a0, a0, a2 # low word >> n
andi a0, a0, 1
j getbit_end
getbit_l:
sub s0, a2, s0 # n - 32
srl a1, a1, s0 # high word >> (n - 32)
andi a1, a1, 1
mv a0, a1
getbit_end:
lw ra, 0(sp)
lw s0, 4(sp)
addi sp, sp, 8
ret
```
multiplicand × multiplier = product
**flow**
1. read bit i of the multiplier b
2. if the bit is 1, add `a << i` to the result
```clike
int64_t imul32(int32_t a, int32_t b)
{
int64_t r = 0, a64 = (int64_t) a, b64 = (int64_t) b;
for (int i = 0; i < 32; i++) {
if (getbit(b64, i))
r += a64 << i;
}
return r;
}
```
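A hedged comparison against the compiler's native multiply (hypothetical test values; assumes `imul32` and `getbit` above are in the same file). The loop walks only the low 32 bits of `b`, which is fine here because `fmul32` below always passes positive 24-bit mantissas, so the test keeps `b` non-negative:
```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    int32_t tests[][2] = { {3, 5}, {-7, 9}, {0x800000, 0xC00000}, {123456, 654} };
    for (int i = 0; i < 4; i++) {
        int32_t a = tests[i][0], b = tests[i][1];
        assert(imul32(a, b) == (int64_t) a * b);
    }
    return 0;
}
```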
```asm
imul32:
addi sp, sp, -28
sw ra,0(sp)
sw s0,4(sp)
sw s1,8(sp)
sw s2,12(sp)
sw s3,16(sp)
sw s4,20(sp)
sw s5,24(sp)
mv s0, a0 # a
mv s1, a1 # b
li s2, 0
li s3, 0 # result s3,s2
li s4, 0 # i counter
li s5, 32 # loop bound
imul32_loop:
beq s4, s5, imul32_end
mv a0, s1 # a0 = b (saved copy)
# a1 is a don't-care here (bit index < 32)
mv a2, s4 # a2 = i
jal getbit
beq a0, zero, imul_skip # if (getbit(b, i))
sub t2, s5, s4 # t2 = 32 - i
addi t2, t2, -1 # t2 = 31 - i (avoid an out-of-range shift when i = 0)
mv t0, s0 # t0 = a
srl t1, t0, t2
srli t1, t1, 1 # t1 = high word of (a << i), correct even for i = 0
sll t0, t0, s4 # t0 = low word of (a << i)
add t4, s2, t0 # low-word addition
sltu t5, t4, s2 # did the low word wrap around (carry)?
mv s2, t4
add s3, s3, t5 # propagate the carry into the high word
add s3, s3, t1 # r += a << i (high word)
imul_skip:
addi s4, s4, 1 # i++
j imul32_loop
imul32_end:
mv a0, s2
mv a1, s3
lw ra,0(sp)
lw s0,4(sp)
lw s1,8(sp)
lw s2,12(sp)
lw s3,16(sp)
lw s4,20(sp)
lw s5,24(sp)
addi sp,sp,28
ret
```
Merge the result and b into one 64-bit value laid out as result (upper 32 bits) | b (lower 32 bits).
Each iteration adds the multiplicand only to the upper half and then shifts the whole value right by one, so no carry ever has to be propagated from the low word into the high word; this makes the 64-bit arithmetic easy to implement on RV32.
```clike
// improved int32 multiply: shift the accumulator right instead of shifting the addend left
int64_t imul32_improve(int32_t a, int32_t b)
{
int64_t r = (int64_t)b;
int64_t a64 = (int64_t)a;
for (int i = 0; i < 32; i++){
if(r & 1){
r = r + (a64 << 32);
}
r = r >> 1;
}
return r;
}
```
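To see why the merged layout works, here is a hand trace on a hypothetical 4-bit machine (8-bit accumulator, a = 3, b = 5):
:::success
r starts as 0000 0101 (upper half = partial product, lower half = remaining multiplier b)
i=0: r is odd  -> r += a << 4 -> 0011 0101, then r >>= 1 -> 0001 1010
i=1: r is even -> r >>= 1 -> 0000 1101
i=2: r is odd  -> r += a << 4 -> 0011 1101, then r >>= 1 -> 0001 1110
i=3: r is even -> r >>= 1 -> 0000 1111 = 15 = 3 × 5
:::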
**float32 multiply**
**flow**
1. reinterpret the float bits as an integer (IEEE 754 layout |1|8|23|)
2. compute the sign bit: sa ^ sb
3. extract the 23-bit mantissas and restore the implicit leading 1 (23 + 1 = 24 bits)
4. extract the exponents
5. multiply the two 24-bit mantissas with imul32
6. a 24 × 24 multiply is at most 48 bits; after shifting right by 23 the result may still be 25 bits, but only 24 can be kept, so `int mshift = getbit(mrtmp, 24);` checks whether bit 24 is set and an extra normalization shift is needed
7. `int32_t er = mshift ? inc(ertmp) : ertmp;` — if we shifted, the exponent must be incremented by one
8. combine the sign, exponent, and mantissa into the new IEEE 754 value
```c
float fmul32(float a, float b)
{
/* TODO: Special values like NaN and INF */
int32_t ia = *(int32_t *) &a, ib = *(int32_t *) &b;
/* sign */
int sa = ia >> 31;
int sb = ib >> 31;
/* mantissa */
int32_t ma = (ia & 0x7FFFFF) | 0x800000;
int32_t mb = (ib & 0x7FFFFF) | 0x800000;
/* exponent */
int32_t ea = ((ia >> 23) & 0xFF);
int32_t eb = ((ib >> 23) & 0xFF);
/* 'r' = result */
int64_t mrtmp = imul32(ma, mb) >> 23;
int mshift = getbit(mrtmp, 24);
int64_t mr = mrtmp >> mshift;
int32_t ertmp = ea + eb - 127;
int32_t er = mshift ? inc(ertmp) : ertmp;
/* TODO: Overflow ^ */
int sr = sa ^ sb;
int32_t r = (sr << 31) | ((er & 0xFF) << 23) | (mr & 0x7FFFFF);
return *(float *) &r;
}
```
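A small sanity check (hypothetical test values; assumes the functions above are in the same file). The reference truncates the mantissa instead of rounding, so the fair comparison uses products that are exactly representable:
```c
#include <assert.h>
#include <stdio.h>

int main(void)
{
    assert(fmul32(1.5f, 2.0f) == 3.0f);
    assert(fmul32(3.0f, -2.5f) == -7.5f);
    assert(fmul32(0.5f, 0.5f) == 0.25f);
    printf("fmul32 basic checks passed\n");
    return 0;
}
```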
```asm
fmul32:
addi sp, sp, -24
sw ra, 0(sp)
sw s0, 4(sp)
sw s1, 8(sp)
sw s2, 12(sp)
sw s3, 16(sp)
sw s4, 20(sp)
srli s0, a0, 31
srli s1, a1, 31
xor s0, s0, s1 # s0 = sign_a ^ sign_b
li t0, 0x7FFFFF
li t1, 0x800000
and s1, a0, t0
or s1, s1, t1 # s1 = mantissa_a
and s2, a1, t0
or s2, s2, t1 # s2 = mantissa_b
srli s3, a0, 23
andi s3, s3, 0xFF # s3 = exp_a
srli s4, a1, 23
andi s4, s4, 0xFF # s4 = exp_b
mv a0, s1
mv a1, s2
jal imul32
mv s1, a0
mv s2, a1 # s1,s2 = mantissa_a * mantissa_b
srli s1, s1, 23
slli s2, s2, 9
or s1, s1, s2 # s1 = mantissa_a * mantissa_b >> 23
# s2 dont care
mv a0, s1 # a0 = mantissa_a * mantissa_b >> 23
li a2, 24 # a2 = 24 (check bit 24 for mshift)
jal getbit
srl s1, s1, a0 # s1 = mantissa_a * mantissa_b >> 23 >> getbit
add s3, s3, s4 # s3 = exp_a + exp_b
addi s3, s3, -127
# s4 dont care
# int32_t er = mshift ? inc(ertmp) : ertmp;
# skip inc
add s3, s3, a0 # s3 = er
slli s0, s0, 31 # s0 = sr << 31
andi s3, s3, 0xFF # s3 = er
slli s3, s3, 23 # s3 = er << 23
li t0, 0x7FFFFF
and s1, s1, t0 # s1 = mr
or s0, s0, s3 # s0 = (sr << 31) | (er << 23)
or s0, s0, s1 # s0 = (sr << 31) | (er << 23) | mr
mv a0, s0
lw ra, 0(sp)
lw s0, 4(sp)
lw s1, 8(sp)
lw s2, 12(sp)
lw s3, 16(sp)
lw s4, 20(sp)
addi sp, sp, 24
ret
```
### Analyze
- What is CPI?
Cycles Per Instruction, computed as below; a lower CPI is better.
$$ CPI = \frac{\sum_i (IC_i)(CC_i)}{IC} $$
where $IC_i$ and $CC_i$ are the instruction count and cycle cost of instruction class $i$, and $IC$ is the total instruction count.
- What is IPC?
Instructions Per Cycle, the reciprocal of CPI; higher is better.
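For example (made-up numbers): a program that retires $10^6$ instructions in $1.25 \times 10^6$ cycles has $CPI = 1.25$ and $IPC = 1/1.25 = 0.8$; if an optimization removes enough stalls that the same instructions finish in $10^6$ cycles, CPI drops to 1.0 and IPC rises to 1.0.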
When calculating floating-point multiplication, we need to compute the sign bit, exponent, and mantissa. This calculation involves multiplying the mantissas of two floating-point numbers, which means using two 23-bit segments from the IEEE representation, denoted as |1|8|23|. Upon observing the actual operation using a simulator, we identified that the performance bottleneck primarily lies in the mantissa calculation. The two diagrams below show the original and improved data, demonstrating a significant reduction of nearly 50% in cycles.


**difference**
**original code**
```asm
imul32_loop:
beq s4, s5, imul32_end
mv a0, a1 # a0 = b
mv a2, s4 # a2 = i
jal getbit
beq a0, zero, imul_skip # if (getbit(b, i))
sub t2, s5, s4 # t2 = 32 - i
mv t0, s0 # t0 = a
srl t1, t0, t2 # shift left i
sll t0, t0, s4 # shift left i
slli t3, s2, 31 # overflow check
slli t4, t0, 31 # overflow check
and t5, t3, t4 # overflow check
beq t5, zero, 8 # if (overflow)
addi s3, s3, 1 # result++
add s2, s2, t0
add s3, s3, t1 # r += a << i
imul_skip:
addi s4, s4, 1 # i++
j imul32_loop
```
1. Calling the `getbit` function via `jal getbit` adds significant cycle overhead. Observing the pipeline, each `jal` also introduces two additional NOP instructions (bubbles).


A flush then follows: the instructions fetched after the jump are cleared.

2. When `getbit` returns 1, we perform `r += a64 << i;`: shift `a64` left by `i`, then add it to `r`. This introduces a potential issue: a carry out of the lower register.
In our simulation we use the RV32 architecture, which works on 32-bit values, but a 32-bit × 32-bit multiplication can produce a 64-bit result, so two registers are needed to hold it. Adding the lower 32 bits may generate a carry that must be added into the upper register; this is why the RISC-V code above contains the extra carry checks.
These checks keep the 64-bit result correct even though each individual addition is only 32 bits wide.
example
:::success
Assume a 4-bit processor.
When adding 1001(a1) 1001(a0) + 0100(t1) 1000(t0), the low-word addition a0 + t0 produces a carry, so the high word must be computed as sum(a1, t1, 1), i.e. a1 + t1 + the carry.
:::
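A C sketch of the same carry detection using the unsigned-compare trick (names are illustrative):
```c
#include <stdint.h>

/* add two low words and report the carry that must go into the high word */
static uint32_t add_lo_with_carry(uint32_t lo_a, uint32_t lo_b, uint32_t *carry)
{
    uint32_t sum = lo_a + lo_b;
    *carry = sum < lo_a; /* unsigned wrap-around means a carry was produced */
    return sum;
}
```
On RV32 the same test is a single `sltu` following the `add`.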

3. Reordering the instructions
For example, in the following code the instructions were deliberately reordered, which can be an effective way to avoid read-after-write (RAW) hazards. Observation shows, however, that it makes no difference here, because data forwarding already resolves the dependency: the gray selectors differ, indicating that the value of t3 is forwarded from the previous pipeline stage.
*order*
```asm
srli s1, s1, 1 # result >> 1
slli t3, s2, 31 # result << (32 - 1)
or s1, s1, t3 # result | (result << (32 - 1))
srli s2, s2, 1 # result >> 1
```


*reorder*
```asm
srli s1, s1, 1 # result >> 1
slli t3, s2, 31 # result << (32 - 1)
srli s2, s2, 1 # result >> 1
or s1, s1, t3 # result | (result << (32 - 1))
```


**improve**
```asm
imul32_loop:
beq t0, t1, imul32_end
andi t2, s1, 1 # if (result & 1)
beq t2, zero, imul32_skip
add s2, s2, s0 # result += a
imul32_skip:
srli s1, s1, 1 # result >> 1
slli t3, s2, 31 # result << (32 - 1)
srli s2, s2, 1 # result >> 1
or s1, s1, t3 # result | (result << (32 - 1))
addi t0, t0, 1 # i++
j imul32_loop
```
### Hazard
Examples of the various hazard types:
- Structural hazard
    - two instructions need the same hardware resource in the same cycle, e.g. both trying to write the register file
- Data hazard
    - an instruction depends on a result that an earlier instruction has not yet written back
- Control hazard
    - the next instruction to fetch is unknown until a branch or jump resolves
### problem
Branches happen frequently, and each one comes with two NOP bubbles. Can we avoid this?
