Try   HackMD

Image scaling with Bilinear interpolation by float32 multiplication

contributed by <linyu425>

Image scaling with Bilinear interpolation by float32

Suppose we want the value of an unknown function $f$ at the point $P=(x,y)$, and we know the values of $f$ at the four points $Q_{11}=(x_1,y_1)$, $Q_{12}=(x_1,y_2)$, $Q_{21}=(x_2,y_1)$, and $Q_{22}=(x_2,y_2)$.
First, perform linear interpolation in the x-direction,

$$f(x,y_1) \approx \frac{x_2-x}{x_2-x_1}\,f(Q_{11}) + \frac{x-x_1}{x_2-x_1}\,f(Q_{21})$$

$$f(x,y_2) \approx \frac{x_2-x}{x_2-x_1}\,f(Q_{12}) + \frac{x-x_1}{x_2-x_1}\,f(Q_{22})$$

, and then perform linear interpolation in the y-direction, resulting in

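$$f(x,y) \approx \frac{y_2-y}{y_2-y_1}\,f(R_1) + \frac{y-y_1}{y_2-y_1}\,f(R_2)$$

where $R_1=(x,y_1)$ and $R_2=(x,y_2)$ are the two points obtained from the interpolation in the x-direction.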

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

To achieve the aforementioned goals, it is necessary to utilize floating-point multiplication and addition.

Implement

I use a two-dimensional array (im_2) to store a 2x2 image and save the enlarged result in a 5x5 two-dimensional array (im_5).

In this version, we enlarge a 2x2 image to a 5x5 image. The four corner values are copied directly from the source image, so we must fill in the remaining 21 values. Each of these values requires two fmul32 calls and one fadd32 call.

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

First, I calculate the values within the red box, and in this stage, each value is obtained through linear interpolation.

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

Then, we calculate the values in the blue box by bilinear interpolation with the values in the red box.

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

Example

We use a 2*2 grayscale image as an enlargement example.

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

The following image is the 5x5 enlarged grayscale image obtained based on the above steps.

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

C code

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define IN_N 2
#define OUT_N 5

/* IEEE-754 binary32 field layout: 1 sign bit, 8 exponent bits, 23 fraction bits. */
#define F32_SIGN_MASK 0x80000000u
#define F32_FRAC_MASK 0x007FFFFFu
#define F32_HIDDEN_BIT 0x00800000u
#define F32_INF_BITS 0x7F800000u
#define F32_EXP_SHIFT 23
#define F32_EXP_MAX 0xFF
#define F32_EXP_BIAS 127

/* Reinterpret a float as its raw bit pattern (memcpy avoids aliasing UB). */
static inline uint32_t f32_to_bits(float f)
{
    uint32_t u;
    memcpy(&u, &f, sizeof u);
    return u;
}

/* Reinterpret a raw bit pattern as a float. */
static inline float f32_from_bits(uint32_t u)
{
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

/* Extract the biased exponent field of a binary32 bit pattern. */
static inline int32_t f32_exponent(uint32_t bits)
{
    return (int32_t) ((bits >> F32_EXP_SHIFT) & 0xFFu);
}

/*
 * Assemble a binary32 from a sign bit (already in bit 31), a biased exponent
 * and a normalized mantissa (hidden bit at bit 23, which is dropped here).
 * Exponent underflow is flushed to signed zero, overflow becomes infinity.
 */
static float f32_pack(uint32_t sign, int32_t exp, uint32_t mant)
{
    if (exp <= 0)
        return f32_from_bits(sign);
    if (exp >= F32_EXP_MAX)
        return f32_from_bits(sign | F32_INF_BITS);
    return f32_from_bits(sign | ((uint32_t) exp << F32_EXP_SHIFT) |
                         (mant & F32_FRAC_MASK));
}

/*
 * Return a mask of the consecutive 1 bits at the low end of x, i.e. all bits
 * below the lowest 0 bit of x (0 if bit 0 is clear, x itself if x is all ones).
 */
uint32_t mask_lowest_zero(uint32_t x)
{
    uint32_t mask = x;
    mask &= (mask << 1) | 0x1;
    mask &= (mask << 2) | 0x3;
    mask &= (mask << 4) | 0xF;
    mask &= (mask << 8) | 0xFF;
    mask &= (mask << 16) | 0xFFFF;
    return mask;
}

/* Count the leading zero bits of x (returns 32 for x == 0). */
uint32_t count_leading_zeros(uint32_t x)
{
    /* Smear the highest set bit into every lower position. */
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);

    /* count ones (population count) */
    x -= ((x >> 1) & 0x55555555);
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = ((x >> 4) + x) & 0x0f0f0f0f;
    x += (x >> 8);
    x += (x >> 16);

    return (32 - (x & 0x7f));
}

/*
 * Increment x by one using only bitwise operations: the trailing run of 1 bits
 * is cleared and the 0 bit just above it is set. All-ones wraps to 0.
 */
int32_t inc(int32_t x)
{
    if (~x == 0)
        return 0;

    /* Work on the unsigned pattern so the left shift is well defined. */
    uint32_t ux = (uint32_t) x;
    uint32_t mask = mask_lowest_zero(ux);
    uint32_t z1 = mask ^ ((mask << 1) | 1);
    return (int32_t) ((ux & ~mask) | z1);
}

/* Return bit n of value (0 or 1). */
static inline int32_t getbit(int32_t value, int n)
{
    return (value >> n) & 1;
}

/*
 * Truncating shift-and-add multiply used for mantissas.
 *
 * Scans b from its lowest bit upward; after each bit except the last, the
 * running sum is shifted right by one, discarding the low bit. The result is
 * therefore approximately (a * b) >> k, where k is the index of the highest
 * set bit of b. For two 24-bit mantissas (bit 23 set) this yields the product
 * scaled by 2^-23, which fits in 25 bits.
 */
int32_t imul32(int32_t a, int32_t b)
{
    int32_t r = 0;
    /* Unsigned copy so the loop terminates even if b is negative. */
    uint32_t ub = (uint32_t) b;

    for (;;) {
        if ((ub & 1) != 0)
            r = r + a;
        ub >>= 1;
        if (ub == 0)
            break;
        r = r >> 1;
    }
    return r;
}

/*
 * Software float32 multiply.
 *
 * Limitations: the mantissa product is truncated rather than rounded, inputs
 * with a zero exponent field (zero and subnormals) are treated as zero, and
 * NaN / infinity inputs are not handled.
 */
float fmul32(float a, float b)
{
    uint32_t ia = f32_to_bits(a), ib = f32_to_bits(b);

    /* sign */
    uint32_t sr = (ia ^ ib) & F32_SIGN_MASK;

    /* exponent */
    int32_t ea = f32_exponent(ia);
    int32_t eb = f32_exponent(ib);

    /* Anything times zero is (signed) zero; there is no hidden bit to add. */
    if (ea == 0 || eb == 0)
        return f32_from_bits(sr);

    /* mantissa, with the implicit leading one restored */
    int32_t ma = (int32_t) ((ia & F32_FRAC_MASK) | F32_HIDDEN_BIT);
    int32_t mb = (int32_t) ((ib & F32_FRAC_MASK) | F32_HIDDEN_BIT);

    /* 'r' = result */
    int32_t mrtmp = imul32(ma, mb);
    /* Product of two values in [1, 2) lies in [1, 4): renormalize if >= 2. */
    int mshift = getbit(mrtmp, 24);
    int32_t mr = mrtmp >> mshift;
    int32_t ertmp = ea + eb - F32_EXP_BIAS;
    int32_t er = mshift ? inc(ertmp) : ertmp;

    return f32_pack(sr, er, (uint32_t) mr);
}

/*
 * Software float32 add.
 *
 * The operand with the larger magnitude supplies the result sign; the smaller
 * mantissa is aligned to it by a right shift (bits shifted out are dropped,
 * so the result is truncated, not rounded). Inputs with a zero exponent field
 * are treated as zero; NaN / infinity inputs are not handled.
 */
float fadd32(float a, float b)
{
    uint32_t ia = f32_to_bits(a), ib = f32_to_bits(b);

    /* Make ia the operand with the larger magnitude. */
    if ((ia & ~F32_SIGN_MASK) < (ib & ~F32_SIGN_MASK)) {
        uint32_t temp = ia;
        ia = ib;
        ib = temp;
    }

    /* sign */
    uint32_t sign = ia & F32_SIGN_MASK;
    int differ = ((ia ^ ib) & F32_SIGN_MASK) != 0;

    /* exponent */
    int32_t ea = f32_exponent(ia);
    int32_t eb = f32_exponent(ib);

    /* Adding zero: return the other operand (+0 unless both are -0). */
    if (eb == 0) {
        if (ea == 0)
            return f32_from_bits(ia & ib & F32_SIGN_MASK);
        return f32_from_bits(ia);
    }

    /* mantissa, with the implicit leading one restored */
    uint32_t ma = (ia & F32_FRAC_MASK) | F32_HIDDEN_BIT;
    uint32_t mb = (ib & F32_FRAC_MASK) | F32_HIDDEN_BIT;

    /* ea >= eb here, so the shift count is in [0, 24]. */
    int32_t align = (ea - eb > 24) ? 24 : (ea - eb);
    mb >>= align;

    /* Opposite signs subtract magnitudes; equal signs add them. */
    if (differ)
        ma -= mb;
    else
        ma += mb;

    /* Exact cancellation has no leading one to normalize. */
    if (ma == 0)
        return 0.0f;

    /* Renormalize so the leading one sits at bit 23 (8 leading zeros). */
    int32_t clz = (int32_t) count_leading_zeros(ma);
    int32_t shift = 0;
    if (clz <= 8) {
        shift = 8 - clz;
        ma >>= shift;
        ea += shift;
    } else {
        shift = clz - 8;
        ma <<= shift;
        ea -= shift;
    }

    return f32_pack(sign, ea, ma);
}

/*
 * Linear interpolation between two samples at step i of OUT_N - 1 equal
 * steps: first * (OUT_N-1-i)/(OUT_N-1) + last * i/(OUT_N-1), computed with
 * the software fmul32 / fadd32 routines.
 */
static float interpolate(float first, float last, int i)
{
    float w_first = (float) (OUT_N - 1 - i) / (float) (OUT_N - 1);
    float w_last = (float) (i) / (float) (OUT_N - 1);
    return fadd32(fmul32(first, w_first), fmul32(last, w_last));
}

/* Enlarge a 2x2 image to 5x5 with bilinear interpolation and print it. */
int main(void)
{
    float im_2[IN_N][IN_N] = {{0.95478, 0.64721},
                              {0.823257, 0.22245}};
    float im_5[OUT_N][OUT_N] = {{0}};

    /* The four corners are copied straight from the source image. */
    im_5[0][0] = im_2[0][0];
    im_5[0][OUT_N - 1] = im_2[0][IN_N - 1];
    im_5[OUT_N - 1][0] = im_2[IN_N - 1][0];
    im_5[OUT_N - 1][OUT_N - 1] = im_2[IN_N - 1][IN_N - 1];

    /* Fill the interior of the top and bottom rows (x-direction). */
    for (int i = 1; i < OUT_N - 1; i++) {
        im_5[0][i] = interpolate(im_5[0][0], im_5[0][OUT_N - 1], i);
        im_5[OUT_N - 1][i] =
            interpolate(im_5[OUT_N - 1][0], im_5[OUT_N - 1][OUT_N - 1], i);
    }

    /* Fill every remaining row from the top and bottom rows (y-direction). */
    for (int i = 1; i < OUT_N - 1; i++) {
        for (int j = 0; j < OUT_N; j++)
            im_5[i][j] = interpolate(im_5[0][j], im_5[OUT_N - 1][j], i);
    }

    for (int i = 0; i < OUT_N; i++) {
        for (int j = 0; j < OUT_N; j++)
            printf("%f ", im_5[i][j]);
        printf("\n");
    }
    return 0;
}

/*
 * Output reported by the original author for this input (mantissas are
 * truncated, not rounded, so the last printed digit may vary):
 *
 * answer =
 * 0.954780 0.877887 0.800995 0.724102 0.647210
 * 0.921899 0.826679 0.731460 0.636240 0.541020
 * 0.889019 0.775471 0.661924 0.548377 0.434830
 * 0.856138 0.724263 0.592389 0.460514 0.328640
 * 0.823257 0.673055 0.522853 0.372652 0.222450
 */

Assembly Code

Assembly code on github main_v2.s.

Verification and analysis

The following image is the result after executing with Ripes.

This is the unoptimized version; it needs 20108 cycles to finish.

Data cache hit rate is 99.46%.

Instr cache hit rate is 90.61%.

Improvement

To reduce the cycle count, note from the interpolation process above that every interpolated value requires two calls to fmul32. Therefore, if we can reduce the number of instructions executed inside fmul32, we can significantly reduce the total cycle count.

To achieve this, I rewrite the part of the fmul32 function that uses the inc function.

int32_t er = mshift ? inc(ertmp) : ertmp;

The inc function is primarily used to increment the input number by one.

int32_t er; if(mshift) er = ertmp + 1; else er = ertmp;

After the modification, there is no need to execute the inc function and the mask_lowest_zero function every time when incrementing by one.

Furthermore, although getbit is declared static inline in the C code, the initial assembly version did not inline it; it was written as a separate function with its own prologue and epilogue.

# getbit: return bit a1 of the value in a0, i.e. a0 = (a0 >> a1) & 1.
# Stand-alone (non-inlined) version: it saves and restores s0, s1 and ra
# around a two-instruction body, which is the call overhead discussed here.
getbit:
    # Test inputs, left commented out:
    #li a0 , 0x00000011
    #li a1 , 1
    addi sp , sp , -12      # reserve 12 bytes of stack
    sw s0 , 0(sp)           # save s0
    sw s1 , 4(sp)           # save s1
    sw ra , 8(sp)           # save return address
    mv s0 , a0              # s0 = value
    mv s1 , a1              # s1 = bit index n
    sra t0 , s0 , s1        # t0 = value >> n (arithmetic shift)
    andi a0 , t0 , 1        # a0 = lowest bit of the shifted value
    lw s0 , 0(sp)           # restore s0
    lw s1 , 4(sp)           # restore s1
    lw ra , 8(sp)           # restore return address
    addi sp , sp , 12       # release the stack frame
    ret

We simply need to inline the body of getbit at its call site, which removes the extra cycles spent on the function call and on saving and restoring registers.

mv a0 , s4 mv a1 , s5 call imul32 mv s8 , a0 #inline getbit li t0 , 24 sra t0 , s8 , t0 andi s9 , t0 , 1

The improved version is main_v2.s; it needs 18595 cycles to finish.

Data cache hit rate is 99.36%.

Instr cache hit rate is 91.53%.

Analysis

Generated from Ripes

0: 4300006f jal x0 1072 <main> 00000004 <count_leading_zeros>: 4: ff810113 addi x2 x2 -8 8: 00812023 sw x8 0 x2 c: 00112223 sw x1 4 x2 10: 00050413 addi x8 x10 0 14: 00145293 srli x5 x8 1 18: 00546433 or x8 x8 x5 1c: 00245293 srli x5 x8 2 20: 00546433 or x8 x8 x5 24: 00445293 srli x5 x8 4 28: 00546433 or x8 x8 x5 2c: 00845293 srli x5 x8 8 30: 00546433 or x8 x8 x5 34: 01045293 srli x5 x8 16 38: 00546433 or x8 x8 x5 3c: 00145293 srli x5 x8 1 40: 55555337 lui x6 0x55555 44: 55530313 addi x6 x6 1365 48: 0062f2b3 and x5 x5 x6 4c: 40540433 sub x8 x8 x5 50: 33333337 lui x6 0x33333 54: 33330313 addi x6 x6 819 58: 006473b3 and x7 x8 x6 5c: 00245293 srli x5 x8 2 60: 33333337 lui x6 0x33333 64: 33330313 addi x6 x6 819 68: 0062f2b3 and x5 x5 x6 6c: 00728433 add x8 x5 x7 70: 00445293 srli x5 x8 4 74: 008282b3 add x5 x5 x8 78: 0f0f1337 lui x6 0xf0f1 7c: f0f30313 addi x6 x6 -241 80: 0062f433 and x8 x5 x6 84: 00845293 srli x5 x8 8 88: 00540433 add x8 x8 x5 8c: 01045293 srli x5 x8 16 90: 00540433 add x8 x8 x5 94: 02000293 addi x5 x0 32 98: 07f47313 andi x6 x8 127 9c: 40628533 sub x10 x5 x6 a0: 00012403 lw x8 0 x2 a4: 00412083 lw x1 4 x2 a8: 00810113 addi x2 x2 8 ac: 00008067 jalr x0 x1 0 000000b0 <imul32>: b0: ff010113 addi x2 x2 -16 b4: 00812023 sw x8 0 x2 b8: 00912223 sw x9 4 x2 bc: 01212423 sw x18 8 x2 c0: 00112623 sw x1 12 x2 c4: 00050413 addi x8 x10 0 c8: 00058493 addi x9 x11 0 cc: 00000913 addi x18 x0 0 000000d0 <while_imul32>: d0: 0014f293 andi x5 x9 1 d4: 00028463 beq x5 x0 8 <exitif> d8: 00890933 add x18 x18 x8 000000dc <exitif>: dc: 0014d493 srli x9 x9 1 e0: 00048663 beq x9 x0 12 <done_imul32> e4: 00195913 srli x18 x18 1 e8: fe9ff06f jal x0 -24 <while_imul32> 000000ec <done_imul32>: ec: 00090513 addi x10 x18 0 f0: 00012403 lw x8 0 x2 f4: 00412483 lw x9 4 x2 f8: 00812903 lw x18 8 x2 fc: 00c12083 lw x1 12 x2 100: 01010113 addi x2 x2 16 104: 00008067 jalr x0 x1 0 00000108 <fmul32>: 108: fcc10113 addi x2 x2 -52 10c: 00812023 sw x8 0 x2 110: 00912223 sw x9 4 x2 114: 01212423 
sw x18 8 x2 118: 01312623 sw x19 12 x2 11c: 01412823 sw x20 16 x2 120: 01512a23 sw x21 20 x2 124: 01612c23 sw x22 24 x2 128: 01712e23 sw x23 28 x2 12c: 03812023 sw x24 32 x2 130: 03912223 sw x25 36 x2 134: 03a12423 sw x26 40 x2 138: 03b12623 sw x27 44 x2 13c: 02112823 sw x1 48 x2 140: 00050413 addi x8 x10 0 144: 00058493 addi x9 x11 0 148: 01f00293 addi x5 x0 31 14c: 00545933 srl x18 x8 x5 150: 0054d9b3 srl x19 x9 x5 154: 008002b7 lui x5 0x800 158: fff28293 addi x5 x5 -1 15c: 005472b3 and x5 x8 x5 160: 00800337 lui x6 0x800 164: 0062ea33 or x20 x5 x6 168: 008002b7 lui x5 0x800 16c: fff28293 addi x5 x5 -1 170: 0054f2b3 and x5 x9 x5 174: 00800337 lui x6 0x800 178: 0062eab3 or x21 x5 x6 17c: 01700293 addi x5 x0 23 180: 005452b3 srl x5 x8 x5 184: 0ff00313 addi x6 x0 255 188: 0062fb33 and x22 x5 x6 18c: 01700293 addi x5 x0 23 190: 0054d2b3 srl x5 x9 x5 194: 0ff00313 addi x6 x0 255 198: 0062fbb3 and x23 x5 x6 19c: 000a0513 addi x10 x20 0 1a0: 000a8593 addi x11 x21 0 1a4: 00000097 auipc x1 0x0 <start> 1a8: f0c080e7 jalr x1 x1 -244 1ac: 00050c13 addi x24 x10 0 1b0: 01800293 addi x5 x0 24 1b4: 405c52b3 sra x5 x24 x5 1b8: 0012fc93 andi x25 x5 1 1bc: 019c5d33 srl x26 x24 x25 1c0: 017b02b3 add x5 x22 x23 1c4: 07f00313 addi x6 x0 127 1c8: 40628c33 sub x24 x5 x6 1cc: 01900663 beq x0 x25 12 <no_inc_ertmp> 1d0: 001c0d93 addi x27 x24 1 1d4: 0080006f jal x0 8 <fmul32_exitifelse> 000001d8 <no_inc_ertmp>: 1d8: 000c0d93 addi x27 x24 0 000001dc <fmul32_exitifelse>: 1dc: 01394cb3 xor x25 x18 x19 1e0: 01f00293 addi x5 x0 31 1e4: 005c92b3 sll x5 x25 x5 1e8: 0ff00313 addi x6 x0 255 1ec: 006df333 and x6 x27 x6 1f0: 01700393 addi x7 x0 23 1f4: 00731333 sll x6 x6 x7 1f8: 008003b7 lui x7 0x800 1fc: fff38393 addi x7 x7 -1 200: 007d73b3 and x7 x26 x7 204: 0062e2b3 or x5 x5 x6 208: 0072e533 or x10 x5 x7 20c: 00012403 lw x8 0 x2 210: 00412483 lw x9 4 x2 214: 00812903 lw x18 8 x2 218: 00c12983 lw x19 12 x2 21c: 01012a03 lw x20 16 x2 220: 01412a83 lw x21 20 x2 224: 01812b03 lw x22 24 x2 228: 01c12b83 
lw x23 28 x2 22c: 02012c03 lw x24 32 x2 230: 02412c83 lw x25 36 x2 234: 02812d03 lw x26 40 x2 238: 02c12d83 lw x27 44 x2 23c: 03012083 lw x1 48 x2 240: 03410113 addi x2 x2 52 244: 00008067 jalr x0 x1 0 00000248 <fadd32>: 248: fcc10113 addi x2 x2 -52 24c: 00812023 sw x8 0 x2 250: 00912223 sw x9 4 x2 254: 01212423 sw x18 8 x2 258: 01312623 sw x19 12 x2 25c: 01412823 sw x20 16 x2 260: 01512a23 sw x21 20 x2 264: 01612c23 sw x22 24 x2 268: 01712e23 sw x23 28 x2 26c: 03812023 sw x24 32 x2 270: 03912223 sw x25 36 x2 274: 03a12423 sw x26 40 x2 278: 03b12623 sw x27 44 x2 27c: 02112823 sw x1 48 x2 280: 00050413 addi x8 x10 0 284: 00058493 addi x9 x11 0 288: 800002b7 lui x5 0x80000 28c: fff28293 addi x5 x5 -1 290: 00547333 and x6 x8 x5 294: 0054f3b3 and x7 x9 x5 298: 0063c863 blt x7 x6 16 <noswap> 29c: 00040293 addi x5 x8 0 2a0: 00048413 addi x8 x9 0 2a4: 00028493 addi x9 x5 0 000002a8 <noswap>: 2a8: 01f00293 addi x5 x0 31 2ac: 00545933 srl x18 x8 x5 2b0: 0054d9b3 srl x19 x9 x5 2b4: 008002b7 lui x5 0x800 2b8: fff28293 addi x5 x5 -1 2bc: 00800337 lui x6 0x800 2c0: 005473b3 and x7 x8 x5 2c4: 0063ea33 or x20 x7 x6 2c8: 0054f3b3 and x7 x9 x5 2cc: 0063eab3 or x21 x7 x6 2d0: 01700293 addi x5 x0 23 2d4: 0ff00313 addi x6 x0 255 2d8: 005453b3 srl x7 x8 x5 2dc: 0063fb33 and x22 x7 x6 2e0: 0054d3b3 srl x7 x9 x5 2e4: 0063fbb3 and x23 x7 x6 2e8: 417b02b3 sub x5 x22 x23 2ec: 01800313 addi x6 x0 24 2f0: 00534663 blt x6 x5 12 <setalign_1> 2f4: 00028c13 addi x24 x5 0 2f8: 0080006f jal x0 8 <setalign_exit> 000002fc <setalign_1>: 2fc: 00030c13 addi x24 x6 0 00000300 <setalign_exit>: 300: 018adab3 srl x21 x21 x24 304: 013962b3 or x5 x18 x19 308: 00029663 bne x5 x0 12 <setma_1> 30c: 015a0a33 add x20 x20 x21 310: 0080006f jal x0 8 <setma_exit> 00000314 <setma_1>: 314: 415a0a33 sub x20 x20 x21 00000318 <setma_exit>: 318: 000a0513 addi x10 x20 0 31c: 00000097 auipc x1 0x0 <start> 320: ce8080e7 jalr x1 x1 -792 324: 00050c93 addi x25 x10 0 328: 00000d13 addi x26 x0 0 32c: 00800293 addi x5 x0 8 330: 
0192cc63 blt x5 x25 24 <shift_false> 334: 00800293 addi x5 x0 8 338: 41928d33 sub x26 x5 x25 33c: 01aa5a33 srl x20 x20 x26 340: 01ab0b33 add x22 x22 x26 344: 0140006f jal x0 20 <shift_exit> 00000348 <shift_false>: 348: 00800293 addi x5 x0 8 34c: 405c8d33 sub x26 x25 x5 350: 01aa1a33 sll x20 x20 x26 354: 41ab0b33 sub x22 x22 x26 00000358 <shift_exit>: 358: 800002b7 lui x5 0x80000 35c: 005472b3 and x5 x8 x5 360: 01700313 addi x6 x0 23 364: 006b1333 sll x6 x22 x6 368: 008003b7 lui x7 0x800 36c: fff38393 addi x7 x7 -1 370: 007a73b3 and x7 x20 x7 374: 0062e2b3 or x5 x5 x6 378: 0072e533 or x10 x5 x7 37c: 00012403 lw x8 0 x2 380: 00412483 lw x9 4 x2 384: 00812903 lw x18 8 x2 388: 00c12983 lw x19 12 x2 38c: 01012a03 lw x20 16 x2 390: 01412a83 lw x21 20 x2 394: 01812b03 lw x22 24 x2 398: 01c12b83 lw x23 28 x2 39c: 02012c03 lw x24 32 x2 3a0: 02412c83 lw x25 36 x2 3a4: 02812d03 lw x26 40 x2 3a8: 02c12d83 lw x27 44 x2 3ac: 03012083 lw x1 48 x2 3b0: 03410113 addi x2 x2 52 3b4: 00008067 jalr x0 x1 0 000003b8 <print_image>: 3b8: ff410113 addi x2 x2 -12 3bc: 00812023 sw x8 0 x2 3c0: 00912223 sw x9 4 x2 3c4: 01212423 sw x18 8 x2 3c8: 00000413 addi x8 x0 0 3cc: 00000493 addi x9 x0 0 3d0: 00500913 addi x18 x0 5 000003d4 <p_outer_loop>: 3d4: 05245463 bge x8 x18 72 <p_done> 3d8: 00000493 addi x9 x0 0 000003dc <p_inner_loop>: 3dc: 0324d463 bge x9 x18 40 <p_inner_done> 3e0: 0005a503 lw x10 0 x11 3e4: 00200893 addi x17 x0 2 3e8: 00000073 ecall 3ec: 02000513 addi x10 x0 32 3f0: 00b00893 addi x17 x0 11 3f4: 00000073 ecall 3f8: 00458593 addi x11 x11 4 3fc: 00148493 addi x9 x9 1 400: fddff06f jal x0 -36 <p_inner_loop> 00000404 <p_inner_done>: 404: 10000517 auipc x10 0x10000 408: bfc50513 addi x10 x10 -1028 40c: 00400893 addi x17 x0 4 410: 00000073 ecall 414: 00140413 addi x8 x8 1 418: fbdff06f jal x0 -68 <p_outer_loop> 0000041c <p_done>: 41c: 00012403 lw x8 0 x2 420: 00412483 lw x9 4 x2 424: 00812903 lw x18 8 x2 428: 00c10113 addi x2 x2 12 42c: 00008067 jalr x0 x1 0 00000430 <main>: 430: 
10000517 auipc x10 0x10000 434: bd250513 addi x10 x10 -1070 438: 00052a03 lw x20 0 x10 43c: 00452a83 lw x21 4 x10 440: 10000517 auipc x10 0x10000 444: bd650513 addi x10 x10 -1066 448: 00052283 lw x5 0 x10 44c: 00452303 lw x6 4 x10 450: 00852383 lw x7 8 x10 454: 00c52e03 lw x28 12 x10 458: 10000b17 auipc x22 0x10000 45c: bceb0b13 addi x22 x22 -1074 460: 005b2023 sw x5 0 x22 464: 006b2823 sw x6 16 x22 468: 047b2823 sw x7 80 x22 46c: 07cb2023 sw x28 96 x22 470: 00100413 addi x8 x0 1 474: 00400493 addi x9 x0 4 00000478 <first_loop>: 478: 000b2503 lw x10 0 x22 47c: 10000f17 auipc x30 0x10000 480: b8ef0f13 addi x30 x30 -1138 484: 00300313 addi x6 x0 3 488: 40830333 sub x6 x6 x8 48c: 00231313 slli x6 x6 2 490: 006f0333 add x6 x30 x6 494: 00032583 lw x11 0 x6 498: 00000097 auipc x1 0x0 <start> 49c: c70080e7 jalr x1 x1 -912 4a0: 00050d93 addi x27 x10 0 4a4: 010b2503 lw x10 16 x22 4a8: 10000f17 auipc x30 0x10000 4ac: b62f0f13 addi x30 x30 -1182 4b0: fff40313 addi x6 x8 -1 4b4: 00231313 slli x6 x6 2 4b8: 006f0333 add x6 x30 x6 4bc: 00032583 lw x11 0 x6 4c0: 00000097 auipc x1 0x0 <start> 4c4: c48080e7 jalr x1 x1 -952 4c8: 000d8593 addi x11 x27 0 4cc: 00000097 auipc x1 0x0 <start> 4d0: d7c080e7 jalr x1 x1 -644 4d4: 00241313 slli x6 x8 2 4d8: 01630333 add x6 x6 x22 4dc: 00a32023 sw x10 0 x6 4e0: 050b2503 lw x10 80 x22 4e4: 10000f17 auipc x30 0x10000 4e8: b26f0f13 addi x30 x30 -1242 4ec: 00300313 addi x6 x0 3 4f0: 40830333 sub x6 x6 x8 4f4: 00231313 slli x6 x6 2 4f8: 006f0333 add x6 x30 x6 4fc: 00032583 lw x11 0 x6 500: 00000097 auipc x1 0x0 <start> 504: c08080e7 jalr x1 x1 -1016 508: 00050d93 addi x27 x10 0 50c: 060b2503 lw x10 96 x22 510: 10000f17 auipc x30 0x10000 514: afaf0f13 addi x30 x30 -1286 518: fff40313 addi x6 x8 -1 51c: 00231313 slli x6 x6 2 520: 006f0333 add x6 x30 x6 524: 00032583 lw x11 0 x6 528: 00000097 auipc x1 0x0 <start> 52c: be0080e7 jalr x1 x1 -1056 530: 000d8593 addi x11 x27 0 534: 00000097 auipc x1 0x0 <start> 538: d14080e7 jalr x1 x1 -748 53c: 00241313 
slli x6 x8 2 540: 01630333 add x6 x6 x22 544: 04a32823 sw x10 80 x6 548: 00140413 addi x8 x8 1 54c: f29446e3 blt x8 x9 -212 <first_loop> 550: 00100413 addi x8 x0 1 554: 00400913 addi x18 x0 4 558: 00500993 addi x19 x0 5 0000055c <second_outloop>: 55c: 00000493 addi x9 x0 0 00000560 <second_inloop>: 560: 00249293 slli x5 x9 2 564: 016282b3 add x5 x5 x22 568: 0002a503 lw x10 0 x5 56c: 10000f17 auipc x30 0x10000 570: a9ef0f13 addi x30 x30 -1378 574: 00300313 addi x6 x0 3 578: 40830333 sub x6 x6 x8 57c: 00231313 slli x6 x6 2 580: 006f0333 add x6 x30 x6 584: 00032583 lw x11 0 x6 588: 00000097 auipc x1 0x0 <start> 58c: b80080e7 jalr x1 x1 -1152 590: 00050d93 addi x27 x10 0 594: 00249293 slli x5 x9 2 598: 016282b3 add x5 x5 x22 59c: 0502a503 lw x10 80 x5 5a0: 10000f17 auipc x30 0x10000 5a4: a6af0f13 addi x30 x30 -1430 5a8: fff40313 addi x6 x8 -1 5ac: 00231313 slli x6 x6 2 5b0: 006f0333 add x6 x30 x6 5b4: 00032583 lw x11 0 x6 5b8: 00000097 auipc x1 0x0 <start> 5bc: b50080e7 jalr x1 x1 -1200 5c0: 000d8593 addi x11 x27 0 5c4: 00000097 auipc x1 0x0 <start> 5c8: c84080e7 jalr x1 x1 -892 5cc: 00241313 slli x6 x8 2 5d0: 00830333 add x6 x6 x8 5d4: 00930333 add x6 x6 x9 5d8: 00231313 slli x6 x6 2 5dc: 01630333 add x6 x6 x22 5e0: 00a32023 sw x10 0 x6 5e4: 00148493 addi x9 x9 1 5e8: f734cce3 blt x9 x19 -136 <second_inloop> 5ec: 00140413 addi x8 x8 1 5f0: f72446e3 blt x8 x18 -148 <second_outloop> 5f4: 10000597 auipc x11 0x10000 5f8: a3258593 addi x11 x11 -1486 5fc: 00000097 auipc x1 0x0 <start> 600: dbc080e7 jalr x1 x1 -580 604: 00a00893 addi x17 x0 10 608: 00000073 ecall

IF stage

  • We start from the instruction located at 0x00000488, so addr is equal to 0x00000488.
  • The machine code of this instruction is 0x40830333 (sub x6 x6 x8), so instr is equal to 0x40830333.
  • The next instruction will be at PC + 4 (0x0000048C) because no branch occurs.

ID stage

  • Instruction 0x40830333 is decoded into the following fields:
  • opcode = sub
  • Wr idx = 0x06
  • R1 idx = 0x06
  • R2 idx = 0x08
  • Reg 1 = 0x3F25AF8E from 0x06 Register
  • Reg 2 = 0x00000001 from 0x08 Register
  • For an R-type instruction, the immediate value is a don't-care.
  • The current PC value (0x00000488) and the next PC value (0x0000048C) are just passed through this stage; we don't use them.

EXE stage

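  • Op1 = 0x00000003 comes from forwarding rather than from the register file, because there is a data dependency between the current instruction (sub x6 x6 x8) and the previous instruction (addi x6 x0 3), whose result has not yet been written back to x6.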
  • Op2 = 0x00000001 from previous stage Reg 2 Register.
  • Res = 0x00000002 (Op1 - Op2) because opcode is sub.

MEM stage

An R-type instruction does not need to read or write memory, so Res (0x00000002) simply passes through this stage and goes on to the WB stage.

WB stage

  • The multiplexer chooses Res from the ALU as the final output, so the output value is 0x00000002.
  • The output value and Wr idx are sent back to the register block. Finally, the value 0x00000002 is written into register x6, whose ABI name is t1.

After all these stages are done, the registers are updated like this:

References

Bilinear interpolation - Wikipedia
Detailed Explanation of Bilinear Interpolation for Image Scaling