# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < [00853029](https://github.com/00853029) >
> [respository](https://github.com/00853029/Computer_Architecture/tree/main/hw1)
## Convert RGB image into grayscale by using RV32I ISA
### Description

- The RGB values are converted to grayscale using the NTSC formula:
$$ (0.299*Red) + (0.587*Green) + (0.114*Blue) $$
- This assignment project involves performing floating-point multiplication and addition operations on an image represented in RGB format, following a fixed ratio, to create a new grayscale image.
- The main challenge in this project is that it can only be accomplished using RV32I instructions, without the availability of floating-point addition and multiplication operations.
- Use Quiz 1's `Problem C` to implement
## Implementation
### Idea
- By splitting the bits into sign, exponent, and mantissa according to the IEEE 754 floating-point format, we can perform integer operations on each of them.
- For the multiplication part, we can utilize shifting methods for calculations.
### C code
- main function - Using `unsigned_fadd32` and `fmul` functions to replace floating-point addition and multiplication operations.
#### First version
```clike=
#include <stdio.h>
#include <stdint.h>
void swap(int32_t *x, int32_t *y){
int32_t t = *y;
*y = *x;
*x = t;
return;
}
uint32_t mask_lowest_zero(uint32_t x)
{
uint32_t mask = x;
mask &= (mask << 1) | 0x1;
mask &= (mask << 2) | 0x3;
mask &= (mask << 4) | 0xF;
mask &= (mask << 8) | 0xFF;
mask &= (mask << 16) | 0xFFFF;
//mask &= (mask << 32) | 0xFFFFFFFF;
return mask;
}
int32_t inc(int32_t x)
{
if (~x == 0)
return 0;
int32_t mask = mask_lowest_zero(x);
int32_t z1 = mask ^ ((mask << 1) | 1);
return (x & ~mask) | z1;
}
static inline int32_t getbit(int32_t value, int n)
{
return (value >> n) & 1;
}
/* int32 multiply */
int32_t imul32(int32_t a, int32_t b)
{
int32_t r = 0;
while(b != 0){
if (b & 1){
r += a;
}
b = b >> 1;
r = r >> 1;
}
r = r << 1;
return r;
}
uint32_t count_leading_zeros(uint32_t x) {
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
/* count ones (population count) */
x -= ((x >> 1) & 0x55555555);
x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
x = ((x >> 4) + x) & 0x0F0F0F0F;
x += (x >> 8);
x += (x >> 16);
return (32 - (x & 0x7F));
}
float unsigned_fadd32(float a,float b){
int32_t ia = *(int32_t *)&a, ib = *(int32_t *)&b;
int32_t a_tmp = ia & 0x7FFFFFFF;
int32_t b_tmp = ib & 0x7FFFFFFF;
if (a_tmp < b_tmp)
swap(&ia, &ib);
/* mantissa */
int32_t ma = ia & 0x7FFFFF | 0x800000;
int32_t mb = ib & 0x7FFFFF | 0x800000;
/* exponent */
int32_t ea = (ia >> 23) & 0xFF;
int32_t eb = (ib >> 23) & 0xFF;
int32_t align = (ea - eb > 24) ? 24 : (ea - eb);
mb = mb >> align;
if ((ia ^ ib) >> 31) {
ma -= mb;
} else {
ma += mb;
}
int32_t clz = count_leading_zeros(ma);
int32_t shift = 0;
if (clz <= 8) {
shift = 8 - clz;
ma >>= shift;
ea += shift;
} else {
shift = clz - 8;
ma <<= shift;
ea -= shift;
}
int32_t r = ia & 0x80000000 | ea << 23 | ma & 0x7FFFFF;
return *(float *) &r;
}
/* float32 multiply */
float fmul32(float a, float b)
{
int32_t ia = *(int32_t *) &a, ib = *(int32_t *) &b;
/* sign */
int sa = ia >> 31;
int sb = ib >> 31;
/* mantissa */
int32_t ma = (ia & 0x7FFFFF) | 0x800000;
int32_t mb = (ib & 0x7FFFFF) | 0x800000;
/* exponent */
int32_t ea = ((ia >> 23) & 0xFF);
int32_t eb = ((ib >> 23) & 0xFF);
/* 'r' = result */
int32_t mrtmp = imul32(ma, mb);
int mshift = getbit(mrtmp, 24);
int32_t mr = mrtmp >> mshift;
int32_t ertmp = ea + eb - 127;
int32_t er = mshift ? inc(ertmp) : ertmp;
int sr = sa ^ sb;
int32_t r = (sr << 31) | ((er & 0xFF) << 23) | (mr & 0x7FFFFF);
return *(float *) &r;
}
int main(){
float image[3][3][3] = {{{0.90251149,0.03265091,0.8831173},{0.2139775,0.0737501,0.0399187},{0.21527551,0.8881527,0.7846363}},
{{0.938326,0.64254336,0.0461617},{0.1413221,0.3307385,0.2508785},{0.3833867,0.689476,0.41071482}},
{{0.8925364,0.1480669,0.6812473},{0.9288288,0.23190344,0.3070017},{0.6414362,0.34707349,0.5142535}}};
float grayscale_image[3][3];
for(int i=0;i<3;i=i+1){
for(int j=0;j<3;j=j+1){
grayscale_image[i][j] = unsigned_fadd32(unsigned_fadd32(fmul32(image[i][j][0], 0.299) , fmul32(image[i][j][1], 0.587)) , fmul32(image[i][j][2], 0.114));
}
}
for(int i=0;i<3;i=i+1){
for(int j=0;j<3;j=j+1){
printf("%f ",grayscale_image[i][j]);
}
printf("\n");
}
}
// Result:
// 0.389692 0.111821 0.675161
// 0.662995 0.264999 0.566176
// 0.431446 0.448845 0.454146
```
### Rewrite the `imul` function in `Problem C`
- After adding one hidden bit to the mantissa, the multiplication of the two will yield a total of 48 bits. However, the register is only 32 bits, so it's necessary to right-shift one bit after each addition to prevent overflow in the result.
```clike=
int32_t imul32(int32_t a, int32_t b)
{
int32_t r = 0;
while(b != 0){
if (b & 1){
r = r + a;
}
b = b >> 1;
r = r >> 1;
}
r = r << 1;
return r;
}
```
#### second version
- In `fmul` function, remove `inc` and `mask_lowest_zero` functions.
- Replace by adding `mshift` variable
```clike=
#include <stdio.h>
#include <stdint.h>
void swap(int32_t *x, int32_t *y){
int32_t t = *y;
*y = *x;
*x = t;
return;
}
static inline int32_t getbit(int32_t value, int n)
{
return (value >> n) & 1;
}
/* int32 multiply */
int32_t imul32(int32_t a, int32_t b)
{
int32_t r = 0;
while(b != 0){
if (b & 1){
r += a;
}
b = b >> 1;
r = r >> 1;
}
r = r << 1;
return r;
}
uint32_t count_leading_zeros(uint32_t x) {
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
/* count ones (population count) */
x -= ((x >> 1) & 0x55555555);
x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
x = ((x >> 4) + x) & 0x0F0F0F0F;
x += (x >> 8);
x += (x >> 16);
return (32 - (x & 0x7F));
}
float unsigned_fadd32(float a,float b){
int32_t ia = *(int32_t *)&a, ib = *(int32_t *)&b;
int32_t a_tmp = ia & 0x7FFFFFFF;
int32_t b_tmp = ib & 0x7FFFFFFF;
if (a_tmp < b_tmp)
swap(&ia, &ib);
/* mantissa */
int32_t ma = ia & 0x7FFFFF | 0x800000;
int32_t mb = ib & 0x7FFFFF | 0x800000;
/* exponent */
int32_t ea = (ia >> 23) & 0xFF;
int32_t eb = (ib >> 23) & 0xFF;
int32_t align = (ea - eb > 24) ? 24 : (ea - eb);
mb >>= align;
if ((ia ^ ib) >> 31) {
ma -= mb;
} else {
ma += mb;
}
int32_t clz = count_leading_zeros(ma);
int32_t shift = 0;
if (clz <= 8) {
shift = 8 - clz;
ma >>= shift;
ea += shift;
} else {
shift = clz - 8;
ma <<= shift;
ea -= shift;
}
int32_t r = ia & 0x80000000 | ea << 23 | ma & 0x7FFFFF;
return *(float *) &r;
}
/* float32 multiply */
float fmul32(float a, float b)
{
int32_t ia = *(int32_t *) &a, ib = *(int32_t *) &b;
/* sign */
int sa = ia >> 31;
int sb = ib >> 31;
/* mantissa */
int32_t ma = (ia & 0x7FFFFF) | 0x800000;
int32_t mb = (ib & 0x7FFFFF) | 0x800000;
/* exponent */
int32_t ea = ((ia >> 23) & 0xFF);
int32_t eb = ((ib >> 23) & 0xFF);
/* 'r' = result */
int32_t mrtmp = imul32(ma, mb);
int mshift = getbit(mrtmp, 24);
int32_t mr = mrtmp >> mshift;
int32_t ertmp = ea + eb - 127;
// int32_t er = mshift ? inc(ertmp) : ertmp;
int32_t er = mshift + ertmp;
int sr = sa ^ sb;
int32_t r = (sr << 31) | ((er & 0xFF) << 23) | (mr & 0x7FFFFF);
return *(float *) &r;
}
int main(){
float image[3][3][3] = {{{0.90251149,0.03265091,0.8831173},{0.2139775,0.0737501,0.0399187},{0.21527551,0.8881527,0.7846363}},
{{0.938326,0.64254336,0.0461617},{0.1413221,0.3307385,0.2508785},{0.3833867,0.689476,0.41071482}},
{{0.8925364,0.1480669,0.6812473},{0.9288288,0.23190344,0.3070017},{0.6414362,0.34707349,0.5142535}}};
float grayscale_image[3][3];
for(int i=0;i<3;i=i+1){
for(int j=0;j<3;j=j+1){
grayscale_image[i][j] = unsigned_fadd32(unsigned_fadd32(fmul32(image[i][j][0], 0.299) , fmul32(image[i][j][1], 0.587)) , fmul32(image[i][j][2], 0.114));
}
}
for(int i=0;i<3;i=i+1){
for(int j=0;j<3;j=j+1){
printf("%f ",grayscale_image[i][j]);
}
printf("\n");
}
}
// ANS:
// 0.389692 0.111821 0.675161
// 0.662995 0.264999 0.566176
// 0.431446 0.448845 0.454146
```
### Assembly code
- [Github respository](https://github.com/00853029/Computer_Architecture/tree/main/hw1)
### Performance
- Reduce 1305 clock cycles.
- Clock cylces run by Ripes:
- First version:
<img src="https://hackmd.io/_uploads/ryp9a6ZZT.png" width=40%></img>
- Second version:
<img src="https://hackmd.io/_uploads/B1i_sa-ZT.png" width=40%></img>
- Cache
- First version:
- <img src="https://hackmd.io/_uploads/H12rvRZW6.png" width=50%></img><img src="https://hackmd.io/_uploads/HkacFyM-a.png" width=50%></img>
- Second version:
- <img src="https://hackmd.io/_uploads/HkcGtAWZa.png" width=50%></img><img src="https://hackmd.io/_uploads/By5HtAb-a.png" width=50%></img>
## 5-stage pipelined processor
#### processor screenshot when cycle=157

- Let's observe what changes occur in each stage and the registers for the `add x5 x5 x8` instruction over the next 5 clock cycles.
### Instruction fetch (IF stage)
- `add x5 x5 x8` instruction is located at address 0x0000012c in the instruction memory.
- mechine code about `add x5 x5 x8` is 0x008282b3.

### Instruction decode (ID stage)
- Read two source registers from Register File - `x5` register : 0x000ca299,`x8` register : 0x00e70afe
- <img src="https://hackmd.io/_uploads/Skb0Z1GWp.png" width=40%></img>
### Execute (EXE stage)
- ALU Src1 value is 0x000ca299 : `x5`
- ALU Src2 value is 0x00e70afe : `x8`
- ALU output : 0x00f3ad97
- ALU output = ALU Src1 + ALU Src2

### Memory access (MEM stage)
- The `add` instruction does not allow memory read or write, so in the MEM-stage, it doesn't perform any actions.
- Pass ALU output to next stage(WB-stage)
- DM output : 0x0

### Write back (WB stage)
- Select 0x00f3ad97(ALU output) to write back to Register File.
- Update the register `x5`'s value.
<img src="https://hackmd.io/_uploads/ryG8NJzZ6.png" width=40%></img>
## Result verification
- [Source code-Colab](https://colab.research.google.com/drive/1LKFWfMu1FZk8eC-VGY-jGOZpZ6io9Wmu?usp=sharing)
- Using python-matplotlib to verify the answers and display both the original RGB image and the computed grayscale image.
```python
import matplotlib.pyplot as plt
import numpy as np
def rgb2gray(rgb):
r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2]
gray = 0.299 * r + 0.5870 * g + 0.1140 * b
return gray
image_rgb = np.array([[[0.90251149,0.03265091,0.8831173],[0.2139775,0.0737501,0.0399187],[0.21527551,0.8881527,0.7846363]],
[[0.938326,0.64254336,0.0461617],[0.1413221,0.3307385,0.2508785],[0.3833867,0.689476,0.41071482]],
[[0.8925364,0.1480669,0.6812473],[0.9288288,0.23190344,0.3070017],[0.6414362,0.34707349,0.5142535]]])
plt.axis("off")
plt.imshow(image_rgb)
plt.show()
```
- Original RGB image & the result computed by numpy
- <img src='https://hackmd.io/_uploads/ry6v6n-Wa.png' width='40%'></img><img src='https://hackmd.io/_uploads/BkC9p3Z-6.png' width='40%'></img>
- Grayscale image computed by RV32I assembly code
- <img src='https://hackmd.io/_uploads/SyUWJ6WWp.png' width='40%'></img>
- We can see that the two grayscale-images are almost identical.
- Ripes outputs (The outputs in First and second version are same.)
- 