Assignment1: RISC-V Assembly and Instruction Pipeline

# Assignment1: RISC-V Assembly and Instruction Pipeline contributed by < [00853029](https://github.com/00853029) > > [respository](https://github.com/00853029/Computer_Architecture/tree/main/hw1) ## Convert RGB image into grayscale by using RV32I ISA ### Description ![](https://hackmd.io/_uploads/SkKFzC-Zp.png) - The RGB values are converted to grayscale using the NTSC formula: $$ (0.299*Red) + (0.587*Green) + (0.114*Blue) $$ - This assignment project involves performing floating-point multiplication and addition operations on an image represented in RGB format, following a fixed ratio, to create a new grayscale image. - The main challenge in this project is that it can only be accomplished using RV32I instructions, without the availability of floating-point addition and multiplication operations. - Use Quiz 1's `Problem C` to implement ## Implementation ### Idea - By splitting the bits into sign, exponent, and mantissa according to the IEEE 754 floating-point format, we can perform integer operations on each of them. - For the multiplication part, we can utilize shifting methods for calculations. ### C code - main function - Using `unsigned_fadd32` and `fmul` functions to replace floating-point addition and multiplication operations. #### First version ```clike= #include <stdio.h> #include <stdint.h> void swap(int32_t *x, int32_t *y){ int32_t t = *y; *y = *x; *x = t; return; } uint32_t mask_lowest_zero(uint32_t x) { uint32_t mask = x; mask &= (mask << 1) | 0x1; mask &= (mask << 2) | 0x3; mask &= (mask << 4) | 0xF; mask &= (mask << 8) | 0xFF; mask &= (mask << 16) | 0xFFFF; //mask &= (mask << 32) | 0xFFFFFFFF; return mask; } int32_t inc(int32_t x) { if (~x == 0) return 0; int32_t mask = mask_lowest_zero(x); int32_t z1 = mask ^ ((mask << 1) | 1); return (x & ~mask) | z1; } static inline int32_t getbit(int32_t value, int n) { return (value >> n) & 1; } /* int32 multiply */ int32_t imul32(int32_t a, int32_t b) { int32_t r = 0; while(b != 0){ if (b & 1){ r += a; } b = b >> 1; r = r >> 1; } r = r << 1; return r; } uint32_t count_leading_zeros(uint32_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); /* count ones (population count) */ x -= ((x >> 1) & 0x55555555); x = ((x >> 2) & 0x33333333) + (x & 0x33333333); x = ((x >> 4) + x) & 0x0F0F0F0F; x += (x >> 8); x += (x >> 16); return (32 - (x & 0x7F)); } float unsigned_fadd32(float a,float b){ int32_t ia = *(int32_t *)&a, ib = *(int32_t *)&b; int32_t a_tmp = ia & 0x7FFFFFFF; int32_t b_tmp = ib & 0x7FFFFFFF; if (a_tmp < b_tmp) swap(&ia, &ib); /* mantissa */ int32_t ma = ia & 0x7FFFFF | 0x800000; int32_t mb = ib & 0x7FFFFF | 0x800000; /* exponent */ int32_t ea = (ia >> 23) & 0xFF; int32_t eb = (ib >> 23) & 0xFF; int32_t align = (ea - eb > 24) ? 24 : (ea - eb); mb = mb >> align; if ((ia ^ ib) >> 31) { ma -= mb; } else { ma += mb; } int32_t clz = count_leading_zeros(ma); int32_t shift = 0; if (clz <= 8) { shift = 8 - clz; ma >>= shift; ea += shift; } else { shift = clz - 8; ma <<= shift; ea -= shift; } int32_t r = ia & 0x80000000 | ea << 23 | ma & 0x7FFFFF; return *(float *) &r; } /* float32 multiply */ float fmul32(float a, float b) { int32_t ia = *(int32_t *) &a, ib = *(int32_t *) &b; /* sign */ int sa = ia >> 31; int sb = ib >> 31; /* mantissa */ int32_t ma = (ia & 0x7FFFFF) | 0x800000; int32_t mb = (ib & 0x7FFFFF) | 0x800000; /* exponent */ int32_t ea = ((ia >> 23) & 0xFF); int32_t eb = ((ib >> 23) & 0xFF); /* 'r' = result */ int32_t mrtmp = imul32(ma, mb); int mshift = getbit(mrtmp, 24); int32_t mr = mrtmp >> mshift; int32_t ertmp = ea + eb - 127; int32_t er = mshift ? inc(ertmp) : ertmp; int sr = sa ^ sb; int32_t r = (sr << 31) | ((er & 0xFF) << 23) | (mr & 0x7FFFFF); return *(float *) &r; } int main(){ float image[3][3][3] = {{{0.90251149,0.03265091,0.8831173},{0.2139775,0.0737501,0.0399187},{0.21527551,0.8881527,0.7846363}}, {{0.938326,0.64254336,0.0461617},{0.1413221,0.3307385,0.2508785},{0.3833867,0.689476,0.41071482}}, {{0.8925364,0.1480669,0.6812473},{0.9288288,0.23190344,0.3070017},{0.6414362,0.34707349,0.5142535}}}; float grayscale_image[3][3]; for(int i=0;i<3;i=i+1){ for(int j=0;j<3;j=j+1){ grayscale_image[i][j] = unsigned_fadd32(unsigned_fadd32(fmul32(image[i][j][0], 0.299) , fmul32(image[i][j][1], 0.587)) , fmul32(image[i][j][2], 0.114)); } } for(int i=0;i<3;i=i+1){ for(int j=0;j<3;j=j+1){ printf("%f ",grayscale_image[i][j]); } printf("\n"); } } // Result: // 0.389692 0.111821 0.675161 // 0.662995 0.264999 0.566176 // 0.431446 0.448845 0.454146 ``` ### Rewrite the `imul` function in `Problem C` - After adding one hidden bit to the mantissa, the multiplication of the two will yield a total of 48 bits. However, the register is only 32 bits, so it's necessary to right-shift one bit after each addition to prevent overflow in the result. ```clike= int32_t imul32(int32_t a, int32_t b) { int32_t r = 0; while(b != 0){ if (b & 1){ r = r + a; } b = b >> 1; r = r >> 1; } r = r << 1; return r; } ``` #### second version - In `fmul` function, remove `inc` and `mask_lowest_zero` functions. - Replace by adding `mshift` variable ```clike= #include <stdio.h> #include <stdint.h> void swap(int32_t *x, int32_t *y){ int32_t t = *y; *y = *x; *x = t; return; } static inline int32_t getbit(int32_t value, int n) { return (value >> n) & 1; } /* int32 multiply */ int32_t imul32(int32_t a, int32_t b) { int32_t r = 0; while(b != 0){ if (b & 1){ r += a; } b = b >> 1; r = r >> 1; } r = r << 1; return r; } uint32_t count_leading_zeros(uint32_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); /* count ones (population count) */ x -= ((x >> 1) & 0x55555555); x = ((x >> 2) & 0x33333333) + (x & 0x33333333); x = ((x >> 4) + x) & 0x0F0F0F0F; x += (x >> 8); x += (x >> 16); return (32 - (x & 0x7F)); } float unsigned_fadd32(float a,float b){ int32_t ia = *(int32_t *)&a, ib = *(int32_t *)&b; int32_t a_tmp = ia & 0x7FFFFFFF; int32_t b_tmp = ib & 0x7FFFFFFF; if (a_tmp < b_tmp) swap(&ia, &ib); /* mantissa */ int32_t ma = ia & 0x7FFFFF | 0x800000; int32_t mb = ib & 0x7FFFFF | 0x800000; /* exponent */ int32_t ea = (ia >> 23) & 0xFF; int32_t eb = (ib >> 23) & 0xFF; int32_t align = (ea - eb > 24) ? 24 : (ea - eb); mb >>= align; if ((ia ^ ib) >> 31) { ma -= mb; } else { ma += mb; } int32_t clz = count_leading_zeros(ma); int32_t shift = 0; if (clz <= 8) { shift = 8 - clz; ma >>= shift; ea += shift; } else { shift = clz - 8; ma <<= shift; ea -= shift; } int32_t r = ia & 0x80000000 | ea << 23 | ma & 0x7FFFFF; return *(float *) &r; } /* float32 multiply */ float fmul32(float a, float b) { int32_t ia = *(int32_t *) &a, ib = *(int32_t *) &b; /* sign */ int sa = ia >> 31; int sb = ib >> 31; /* mantissa */ int32_t ma = (ia & 0x7FFFFF) | 0x800000; int32_t mb = (ib & 0x7FFFFF) | 0x800000; /* exponent */ int32_t ea = ((ia >> 23) & 0xFF); int32_t eb = ((ib >> 23) & 0xFF); /* 'r' = result */ int32_t mrtmp = imul32(ma, mb); int mshift = getbit(mrtmp, 24); int32_t mr = mrtmp >> mshift; int32_t ertmp = ea + eb - 127; // int32_t er = mshift ? inc(ertmp) : ertmp; int32_t er = mshift + ertmp; int sr = sa ^ sb; int32_t r = (sr << 31) | ((er & 0xFF) << 23) | (mr & 0x7FFFFF); return *(float *) &r; } int main(){ float image[3][3][3] = {{{0.90251149,0.03265091,0.8831173},{0.2139775,0.0737501,0.0399187},{0.21527551,0.8881527,0.7846363}}, {{0.938326,0.64254336,0.0461617},{0.1413221,0.3307385,0.2508785},{0.3833867,0.689476,0.41071482}}, {{0.8925364,0.1480669,0.6812473},{0.9288288,0.23190344,0.3070017},{0.6414362,0.34707349,0.5142535}}}; float grayscale_image[3][3]; for(int i=0;i<3;i=i+1){ for(int j=0;j<3;j=j+1){ grayscale_image[i][j] = unsigned_fadd32(unsigned_fadd32(fmul32(image[i][j][0], 0.299) , fmul32(image[i][j][1], 0.587)) , fmul32(image[i][j][2], 0.114)); } } for(int i=0;i<3;i=i+1){ for(int j=0;j<3;j=j+1){ printf("%f ",grayscale_image[i][j]); } printf("\n"); } } // ANS: // 0.389692 0.111821 0.675161 // 0.662995 0.264999 0.566176 // 0.431446 0.448845 0.454146 ``` ### Assembly code - [Github respository](https://github.com/00853029/Computer_Architecture/tree/main/hw1) ### Performance - Reduce 1305 clock cycles. - Clock cylces run by Ripes: - First version: <img src="https://hackmd.io/_uploads/ryp9a6ZZT.png" width=40%></img> - Second version: <img src="https://hackmd.io/_uploads/B1i_sa-ZT.png" width=40%></img> - Cache - First version: - <img src="https://hackmd.io/_uploads/H12rvRZW6.png" width=50%></img><img src="https://hackmd.io/_uploads/HkacFyM-a.png" width=50%></img> - Second version: - <img src="https://hackmd.io/_uploads/HkcGtAWZa.png" width=50%></img><img src="https://hackmd.io/_uploads/By5HtAb-a.png" width=50%></img> ## 5-stage pipelined processor #### processor screenshot when cycle=157 ![](https://hackmd.io/_uploads/BySEZJMWT.png) - Let's observe what changes occur in each stage and the registers for the `add x5 x5 x8` instruction over the next 5 clock cycles. ### Instruction fetch (IF stage) - `add x5 x5 x8` instruction is located at address 0x0000012c in the instruction memory. - mechine code about `add x5 x5 x8` is 0x008282b3. ![](https://hackmd.io/_uploads/SycLWJGZa.png) ### Instruction decode (ID stage) - Read two source registers from Register File - `x5` register : 0x000ca299，`x8` register : 0x00e70afe - <img src="https://hackmd.io/_uploads/Skb0Z1GWp.png" width=40%></img>![](https://hackmd.io/_uploads/SyboW1MbT.png) ### Execute (EXE stage) - ALU Src1 value is 0x000ca299 : `x5` - ALU Src2 value is 0x00e70afe : `x8` - ALU output : 0x00f3ad97 - ALU output = ALU Src1 + ALU Src2 ![](https://hackmd.io/_uploads/SyrXfyzbT.png) ### Memory access (MEM stage) - The `add` instruction does not allow memory read or write, so in the MEM-stage, it doesn't perform any actions. - Pass ALU output to next stage(WB-stage) - DM output : 0x0 ![](https://hackmd.io/_uploads/B1dr7Jzb6.png) ### Write back (WB stage) - Select 0x00f3ad97(ALU output) to write back to Register File. - Update the register `x5`'s value. ![](https://hackmd.io/_uploads/BkyVNkfba.png)<img src="https://hackmd.io/_uploads/ryG8NJzZ6.png" width=40%></img> ## Result verification - [Source code-Colab](https://colab.research.google.com/drive/1LKFWfMu1FZk8eC-VGY-jGOZpZ6io9Wmu?usp=sharing) - Using python-matplotlib to verify the answers and display both the original RGB image and the computed grayscale image. ```python import matplotlib.pyplot as plt import numpy as np def rgb2gray(rgb): r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2] gray = 0.299 * r + 0.5870 * g + 0.1140 * b return gray image_rgb = np.array([[[0.90251149,0.03265091,0.8831173],[0.2139775,0.0737501,0.0399187],[0.21527551,0.8881527,0.7846363]], [[0.938326,0.64254336,0.0461617],[0.1413221,0.3307385,0.2508785],[0.3833867,0.689476,0.41071482]], [[0.8925364,0.1480669,0.6812473],[0.9288288,0.23190344,0.3070017],[0.6414362,0.34707349,0.5142535]]]) plt.axis("off") plt.imshow(image_rgb) plt.show() ``` - Original RGB image & the result computed by numpy - <img src='https://hackmd.io/_uploads/ry6v6n-Wa.png' width='40%'></img><img src='https://hackmd.io/_uploads/BkC9p3Z-6.png' width='40%'></img> - Grayscale image computed by RV32I assembly code - <img src='https://hackmd.io/_uploads/SyUWJ6WWp.png' width='40%'></img> - We can see that the two grayscale-images are almost identical. - Ripes outputs (The outputs in First and second version are same.) - ![](https://hackmd.io/_uploads/SJpB5TbbT.png)