# Assignment1: Shell sort with FP32 in BF16 format contributed by < [`TRChen11011`](https://github.com/TRChen11011/Shell-sort-with-FP32-in-BF16-format) > ## Shell sort with Floating number Target to sort float number list with shell sort, and using bf16 format to sort instead of using normal calculate. **For Example**. float number list `1.6, -1.5, 1.4, -1.3, 1.2, -1.1`. list length is 6,so the gap will be 3. after the first round, gap will divide 2 = 1 until gap = 0. gap = 3 : final result : `-1.3, -1.5, -1.1, 1.6, 1.2, 1.4`. gap = 1 : final result : `-1.5, -1.3, -1.1, 1.2, 1.4, 1.6`. ## Implement ### C code ```c #include <stdio.h> #include <stdbool.h> #define array_size 6 unsigned int fp32_to_bf16(float x) { float y = x; int *p = (int *) &y; unsigned int exp = *p & 0x7F800000; unsigned int man = *p & 0x007FFFFF; if (exp == 0 && man == 0) /* zero */ return *p; if (exp == 0x7F800000) /* infinity or NaN */ return *p; float r = x; int *pr = (int *) &r; *pr &= 0xFF800000; /* r has the same exp as x */ r /= 0x100; y = x + r; *p &= 0xFFFF0000; return *p; } bool BOS(float fp32_1, float fp32_2){ unsigned int bf16_1, bf16_2, sig1, sig2, exp1, exp2, man1, man2; /* normalize fp32 to bf16 */ bf16_1 = fp32_to_bf16(fp32_1); bf16_2 = fp32_to_bf16(fp32_2); sig1 = bf16_1 & 0x80000000;/* To get temp and array[j-interval]'s sig, exp, man'*/ sig2 = bf16_2 & 0x80000000; exp1 = bf16_1 & 0x7F800000; exp2 = bf16_2 & 0x7F800000; man1 = bf16_1 & 0x007F0000; man2 = bf16_2 & 0x007F0000; if (sig1 < sig2) return 1; /* Different sign */ else if (sig1 == 0 && sig1 == sig2 && exp1 > exp2) return 1; /* positive sign but different exp*/ else if (sig1 != 0 && sig1 == sig2 && exp1 < exp2) return 1; /* negative sign but different exp*/ else if (sig1 == 0 && sig1 == sig2 && exp1 == exp2 && man1 > man2) return 1; /* posetive sign same exp but different man*/ else if (sig1 != 0 && sig1 == sig2 && exp1 == exp2 && man1 < man2) return 1; /* negative sign same exp but different man*/ else return 0; } void ShellSort(float array[array_size]){ int interval = array_size / 2; /* gap /= 2 */ while (interval >0){ for(int i = interval;i<array_size;i++){ int j = i; float temp = array[i]; /* Flag ? temp < array[j-interval]*/ int Flag = BOS(array[j-interval], temp); while (j>= interval && Flag == 1){ array[j] = array[j-interval]; j = j-interval; Flag = BOS(array[j-interval], temp); } array[j] = temp; } interval = interval / 2; } } void main() { float array[array_size] = {1.6,-1.5,1.4,-1.3,1.2,-1.1}; ShellSort(array); for (int i = 0;i<array_size;i++){ printf("%f\n",array[i]); } return 0; } ``` #### result compile with [Online C Compiler](https://www.programiz.com/c-programming/online-compiler/) ![](https://hackmd.io/_uploads/S1-4LWhep.png) ### Assembly code #### version 1.2 without using BF16 format ```asm! .data dataset1: .word 0x3fcccccd #1.6 dataset2: .word 0xbfc00000 #-1.5 dataset3: .word 0x3fb33333 #1.4 dataset4: .word 0xbfa66666 #-1.3 dataset5: .word 0x3f99999a #1.2 dataset6: .word 0xbf8ccccd #-1.1 array_size: .word 0x00000006 _EOL: .string "\n" .text main: # float array[array_size] = {1.6,-1.5,1.4,-1.3,1.2,-1.1}; addi, sp,sp,-20 lw, s0,dataset1 lw, s1,dataset2 lw, s2,dataset3 lw, s3,dataset4 lw, s4,dataset5 lw, s5,dataset6 sw, s0,0(sp) sw, s1,4(sp) sw, s2,8(sp) sw, s3,12(sp) sw, s4,16(sp) sw, s5,20(sp) lw, s11,array_size jal, ShellSort ShellSort: lw, t0,array_size srli, t0,t0,1 # int interval = array_size / 2; Whileinterval: beq, t0,zero,Print # while (interval >0) add, t1,zero,t0 # i = interval Foriarraysize: beq, t1,s11,Interval_div2 # i<interval add, t2,zero,t1 # j = i slli, s0,t1,2 # temp = array[i] ,t4 = temp add, s1,sp,s0 lw, t4,0(s1) ReadData_Temp_done: sub, t3,t2,t0 # j = j - interval slli, s0,t3,2 # array[j-interval] ,t5 = j-interval add, s1,sp,s0 lw, t5,0(s1) jal, BOS # Flag = BOS(array[j-interval], temp); WhilejandFlag: beq, s10,zero,arrayjtotemp bltu, t2,t0,arrayjtotemp # while (j>= interval && Flag == 1) slli, s0,t2,2 # array[j] = array[j-interval] add, s1,sp,s0 sw, t5,0(s1) sub,t2,t2,t0 jal, ReadData_Temp_done arrayjtotemp: slli, s0,t2,2 # array[j] = temp add, s1,sp,s0 sw, t4,0(s1) addi, t1,t1,1 # i++ jal, Foriarraysize Interval_div2: srli, t0,t0,1 jal, Whileinterval #-------------------------------------------------- BOS: xor, t6,t4,t5 srli, t6,t6,31 beq, t6,zero,Same blt, t5,t4,Big addi, s10,zero,1 jal, WhilejandFlag Big: add, s10,zero,zero jal, WhilejandFlag Same: srli, t6,t4,31 beq, t6,zero,pos bltu, t5,t4,negBig add, s10,zero,zero jal, WhilejandFlag negBig: addi, s10,zero,1 jal, WhilejandFlag pos: blt, t5,t4,posBig addi, s10,zero,1 jal, WhilejandFlag posBig: add, s10,zero,zero jal, WhilejandFlag #-------------------------------------------------- Print: slli, s0,t6,2 add, s1,sp,s0 lw, a0,0(s1) li, a7,2 ecall addi, t6,t6,1 la, a0,_EOL li, a7,4 ecall beq, t6,s11,End jal, Print End: addi, sp,sp,20 li, a7,10 ecall ``` ##### output ![](https://hackmd.io/_uploads/H11QU72la.png)![](https://hackmd.io/_uploads/r1vrUX3xa.png) #### version 1.3 with using BF16 format (final) ```asm! .data dataset1: .word 0x3fcccccd #1.6 dataset2: .word 0xbfc00000 #-1.5 dataset3: .word 0x3fb33333 #1.4 dataset4: .word 0xbfa66666 #-1.3 dataset5: .word 0x3f99999a #1.2 dataset6: .word 0xbf8ccccd #-1.1 array_size: .word 0x00000006 sign: .word 0x80000000 exp: .word 0x7F800000 man: .word 0x007FFFFF bf16man: .word 0x007F0000 bf16format: .word 0xFFFF0000 _EOL: .string "\n" .text main: # float array[array_size] = {1.6,-1.5,1.4,-1.3,1.2,-1.1}; addi, sp,sp,-20 lw, s0,dataset1 #1.6 lw, s1,dataset2 #-1.5 lw, s2,dataset3 #1.4 lw, s3,dataset4 #-1.3 lw, s4,dataset5 #1.2 lw, s5,dataset6 #-1.1 lw, a0,sign lw, a1,exp lw, a2,man lw, a3,bf16man lw, a4,bf16format sw, s0,0(sp) sw, s1,4(sp) sw, s2,8(sp) sw, s3,12(sp) sw, s4,16(sp) sw, s5,20(sp) lw, s11,array_size jal, ShellSort ShellSort: lw, t0,array_size srli, t0,t0,1 # int interval = array_size / 2; Whileinterval: beq, t0,zero,Print # while (interval >0) add, t1,zero,t0 # i = interval Foriarraysize: beq, t1,s11,Interval_div2 # i<interval add, t2,zero,t1 # j = i slli, s0,t1,2 # temp = array[i] t5 = temp add, s1,sp,s0 lw, t5,0(s1) ReadData_Temp_done: sub, t3,t2,t0 # j = j - interval slli, s0,t3,2 # array[j-interval], t4 = j-interval add, s1,sp,s0 lw, t4,0(s1) jal, fp32_to_bf16 # Flag = BOS(array[j-interval], temp); WhilejandFlag: beq, s10,zero,arrayjtotemp bltu, t2,t0,arrayjtotemp # while (j>= interval && Flag == 1) and, t4,t4,a4 # bf16format slli, s0,t2,2 # array[j] = array[j-interval] add, s1,sp,s0 sw, t4,0(s1) sub,t2,t2,t0 jal, ReadData_Temp_done arrayjtotemp: and, t5,t5,a4 # bf16format slli, s0,t2,2 # array[j] = temp add, s1,sp,s0 sw, t5,0(s1) addi, t1,t1,1 # i++ jal, Foriarraysize Interval_div2: srli, t0,t0,1 jal, Whileinterval #-------------------------------------------------- BOS: blt, s0,s1,SMALL beq, s0,s1,Sigsame jal, BIG # if (sig1 < sig2) Sigsame: beq, s2,s3,Expsame beq, s0,zero,Exp1 # if (sig1 == 0 && sig1 == sig2) jal, Exp2 # if (sig1 != 0 && sig1 == sig2) Expsame: beq, s0,zero,Man1 # if (sig1 == 0 && sig1 == sig2 && exp1 == exp2) jal, Man2 Exp1: blt, s2,s3,BIG # exp1 > exp2 jal, SMALL Exp2: blt, s2,s3,SMALL # exp1 < exp2 jal, BIG Man1: blt, s6,s7,SMALL # man1 > man2 jal, BIG Man2: blt, s6,s7,BIG # man1 < man2 jal, SMALL BIG: # return 1; addi, s10,zero,1 jal, WhilejandFlag SMALL: # return 0 addi, s10,zero,0 jal, WhilejandFlag # sig1 = bf16_1 & 0x80000000; # sig2 = bf16_2 & 0x80000000; # exp1 = bf16_1 & 0x7F800000; # exp2 = bf16_2 & 0x7F800000; # man1 = bf16_1 & 0x007F0000; # man2 = bf16_2 & 0x007F0000; # take two number's sig, exp, man and bf16man fp32_to_bf16: and s0,a0,t4 and s1,a0,t5 # sig and s2,a1,t4 and s3,a1,t5 # exp and s4,a2,t4 and s5,a2,t5 # man and s6,a3,t4 and s7,a3,t5 # bf16man jal BOS #-------------------------------------------------- Print: slli, s0,t6,2 add, s1,sp,s0 lw, a0,0(s1) li, a7,2 ecall addi, t6,t6,1 la, a0,_EOL li, a7,4 ecall beq, t6,s11,End jal, Print End: addi, sp,sp,20 li, a7,10 ecall ``` ##### output ![](https://hackmd.io/_uploads/B1LCdXhep.png)![](https://hackmd.io/_uploads/SJt1F73e6.png) ## Analysis & Pipeline generated from Ripes ``` 00000000 <main>: 0: fec10113 addi x2 x2 -20 4: 10000417 auipc x8 0x10000 8: ffc42403 lw x8 -4 x8 c: 10000497 auipc x9 0x10000 10: ff84a483 lw x9 -8 x9 14: 10000917 auipc x18 0x10000 18: ff492903 lw x18 -12 x18 1c: 10000997 auipc x19 0x10000 20: ff09a983 lw x19 -16 x19 24: 10000a17 auipc x20 0x10000 28: feca2a03 lw x20 -20 x20 2c: 10000a97 auipc x21 0x10000 30: fe8aaa83 lw x21 -24 x21 34: 10000517 auipc x10 0x10000 38: fe852503 lw x10 -24 x10 3c: 10000597 auipc x11 0x10000 40: fe45a583 lw x11 -28 x11 44: 10000617 auipc x12 0x10000 48: fe062603 lw x12 -32 x12 4c: 10000697 auipc x13 0x10000 50: fdc6a683 lw x13 -36 x13 54: 10000717 auipc x14 0x10000 58: fd872703 lw x14 -40 x14 5c: 00812023 sw x8 0 x2 60: 00912223 sw x9 4 x2 64: 01212423 sw x18 8 x2 68: 01312623 sw x19 12 x2 6c: 01412823 sw x20 16 x2 70: 01512a23 sw x21 20 x2 74: 10000d97 auipc x27 0x10000 78: fa4dad83 lw x27 -92 x27 7c: 004000ef jal x1 4 <ShellSort> 00000080 <ShellSort>: 80: 10000297 auipc x5 0x10000 84: f982a283 lw x5 -104 x5 88: 0012d293 srli x5 x5 1 0000008c <Whileinterval>: 8c: 0e028263 beq x5 x0 228 <Print> 90: 00500333 add x6 x0 x5 00000094 <Foriarraysize>: 94: 07b30063 beq x6 x27 96 <Interval_div2> 98: 006003b3 add x7 x0 x6 9c: 00231413 slli x8 x6 2 a0: 008104b3 add x9 x2 x8 a4: 0004af03 lw x30 0 x9 000000a8 <ReadData_Temp_done>: a8: 40538e33 sub x28 x7 x5 ac: 002e1413 slli x8 x28 2 b0: 008104b3 add x9 x2 x8 b4: 0004ae83 lw x29 0 x9 b8: 094000ef jal x1 148 <fp32_to_bf16> 000000bc <WhilejandFlag>: bc: 020d0063 beq x26 x0 32 <arrayjtotemp> c0: 0053ee63 bltu x7 x5 28 <arrayjtotemp> c4: 00eefeb3 and x29 x29 x14 c8: 00239413 slli x8 x7 2 cc: 008104b3 add x9 x2 x8 d0: 01d4a023 sw x29 0 x9 d4: 405383b3 sub x7 x7 x5 d8: fd1ff0ef jal x1 -48 <ReadData_Temp_done> 000000dc <arrayjtotemp>: dc: 00ef7f33 and x30 x30 x14 e0: 00239413 slli x8 x7 2 e4: 008104b3 add x9 x2 x8 e8: 01e4a023 sw x30 0 x9 ec: 00130313 addi x6 x6 1 f0: fa5ff0ef jal x1 -92 <Foriarraysize> 000000f4 <Interval_div2>: f4: 0012d293 srli x5 x5 1 f8: f95ff0ef jal x1 -108 <Whileinterval> 000000fc <BOS>: fc: 04944463 blt x8 x9 72 <SMALL> 100: 00940463 beq x8 x9 8 <Sigsame> 104: 038000ef jal x1 56 <BIG> 00000108 <Sigsame>: 108: 01390663 beq x18 x19 12 <Expsame> 10c: 00040863 beq x8 x0 16 <Exp1> 110: 014000ef jal x1 20 <Exp2> 00000114 <Expsame>: 114: 00040c63 beq x8 x0 24 <Man1> 118: 01c000ef jal x1 28 <Man2> 0000011c <Exp1>: 11c: 03394063 blt x18 x19 32 <BIG> 120: 024000ef jal x1 36 <SMALL> 00000124 <Exp2>: 124: 03394063 blt x18 x19 32 <SMALL> 128: 014000ef jal x1 20 <BIG> 0000012c <Man1>: 12c: 017b4c63 blt x22 x23 24 <SMALL> 130: 00c000ef jal x1 12 <BIG> 00000134 <Man2>: 134: 017b4463 blt x22 x23 8 <BIG> 138: 00c000ef jal x1 12 <SMALL> 0000013c <BIG>: 13c: 00100d13 addi x26 x0 1 140: f7dff0ef jal x1 -132 <WhilejandFlag> 00000144 <SMALL>: 144: 00000d13 addi x26 x0 0 148: f75ff0ef jal x1 -140 <WhilejandFlag> 0000014c <fp32_to_bf16>: 14c: 01d57433 and x8 x10 x29 150: 01e574b3 and x9 x10 x30 154: 01d5f933 and x18 x11 x29 158: 01e5f9b3 and x19 x11 x30 15c: 01d67a33 and x20 x12 x29 160: 01e67ab3 and x21 x12 x30 164: 01d6fb33 and x22 x13 x29 168: 01e6fbb3 and x23 x13 x30 16c: f91ff0ef jal x1 -112 <BOS> 00000170 <Print>: 170: 002f9413 slli x8 x31 2 174: 008104b3 add x9 x2 x8 178: 0004a503 lw x10 0 x9 17c: 00200893 addi x17 x0 2 180: 00000073 ecall 184: 001f8f93 addi x31 x31 1 188: 10000517 auipc x10 0x10000 18c: ea850513 addi x10 x10 -344 190: 00400893 addi x17 x0 4 194: 00000073 ecall 198: 01bf8463 beq x31 x27 8 <End> 19c: fd5ff0ef jal x1 -44 <Print> 000001a0 <End>: 1a0: 01410113 addi x2 x2 20 1a4: 00a00893 addi x17 x0 10 1a8: 00000073 ecall ``` ### IF stage - From `PC` is `0x0`. We can know the instruction in IF is `0: fec10113 addi x2 x2 -20`, which is mapped to `addi sp,sp,-20` - And `PC` is `0x4`. Because there is no branch happen. - By `IR`, we can get the machine code `0xfec10113` from `Instr. memory` ![](https://hackmd.io/_uploads/rJFuvE3gp.png) - `PC` will be `PC+4` ![](https://hackmd.io/_uploads/SJ7iDNhlT.png) ### ID stage - The `addi x2 x2 -20 (addi sp,sp,-20)` is an I-type instruction. The `OP code` for `addi` is `0010011` , `func3` is `000` and `-20` in binary is `11111111111111111111111111101100` and `0xffffffec` in hex. So the machine is |IMM|rs1|func3|rd|op| |:-:|:-:|:-:|:-:|:-:| |111111101100|00010|000|00010|0010011| And `0xfec10113` in hex. After decoder, the processor get the `OPcode` `0010011` and know that this is `addi` instruction. And destination register is `x2` which is map to sp register. ![](https://hackmd.io/_uploads/r1Zi242eT.png) ### EX stage 1. The `addi x2 x2 -20` would add `x2` and `imm` , which is `0x7fffffff0` and `0xffffffec` and the result is `0x7fffffdc` ![](https://hackmd.io/_uploads/SJCST43lp.png) and upper immediate value `0x10000000`. Then the result is `0x10000000` 2. The `7c: 004000ef jal x1 4 <ShellSort>` set the `PC` to new position ![](https://hackmd.io/_uploads/r1mBCV3eT.png) ### MEM stage 1. The `5c: 00812023 sw x8 0 x2` instruction store data from the register of `x8`, which is `s0` to address `0(x2)` which is `0(sp)`.and data in `x8` is `0x3fcccccd` and address `0(x2)` is `0x7fffffdc` ![](https://hackmd.io/_uploads/Bye_ZB3la.png) and `Read out` is `0x00000000` ### WB stage 1. The `0: fec10113 addi x2 x2 -20` instruction write the EX stage result `0x7fffffdc` to the register `x2`. ![](https://hackmd.io/_uploads/B118MS3lp.png) ![](https://hackmd.io/_uploads/HJSaGBnxa.png)