# Assignment1: Shell sort with FP32 in BF16 format
contributed by < [`TRChen11011`](https://github.com/TRChen11011/Shell-sort-with-FP32-in-BF16-format) >
## Shell sort with Floating number
Target to sort float number list with shell sort, and using bf16 format to sort instead of using normal calculate.
**For Example**.
float number list `1.6, -1.5, 1.4, -1.3, 1.2, -1.1`.
list length is 6,so the gap will be 3.
after the first round, gap will divide 2 = 1 until gap = 0.
gap = 3 : final result : `-1.3, -1.5, -1.1, 1.6, 1.2, 1.4`.
gap = 1 : final result : `-1.5, -1.3, -1.1, 1.2, 1.4, 1.6`.
## Implement
### C code
```c
#include <stdio.h>
#include <stdbool.h>
#define array_size 6
unsigned int fp32_to_bf16(float x)
{
float y = x;
int *p = (int *) &y;
unsigned int exp = *p & 0x7F800000;
unsigned int man = *p & 0x007FFFFF;
if (exp == 0 && man == 0) /* zero */
return *p;
if (exp == 0x7F800000) /* infinity or NaN */
return *p;
float r = x;
int *pr = (int *) &r;
*pr &= 0xFF800000; /* r has the same exp as x */
r /= 0x100;
y = x + r;
*p &= 0xFFFF0000;
return *p;
}
bool BOS(float fp32_1, float fp32_2){
unsigned int bf16_1, bf16_2, sig1, sig2, exp1, exp2, man1, man2;
/* normalize fp32 to bf16 */
bf16_1 = fp32_to_bf16(fp32_1);
bf16_2 = fp32_to_bf16(fp32_2);
sig1 = bf16_1 & 0x80000000;/* To get temp and array[j-interval]'s sig, exp, man'*/
sig2 = bf16_2 & 0x80000000;
exp1 = bf16_1 & 0x7F800000;
exp2 = bf16_2 & 0x7F800000;
man1 = bf16_1 & 0x007F0000;
man2 = bf16_2 & 0x007F0000;
if (sig1 < sig2) return 1; /* Different sign */
else if (sig1 == 0 && sig1 == sig2 && exp1 > exp2)
return 1; /* positive sign but different exp*/
else if (sig1 != 0 && sig1 == sig2 && exp1 < exp2)
return 1; /* negative sign but different exp*/
else if (sig1 == 0 && sig1 == sig2 && exp1 == exp2 && man1 > man2)
return 1; /* posetive sign same exp but different man*/
else if (sig1 != 0 && sig1 == sig2 && exp1 == exp2 && man1 < man2)
return 1; /* negative sign same exp but different man*/
else return 0;
}
void ShellSort(float array[array_size]){
int interval = array_size / 2; /* gap /= 2 */
while (interval >0){
for(int i = interval;i<array_size;i++){
int j = i;
float temp = array[i];
/* Flag ? temp < array[j-interval]*/
int Flag = BOS(array[j-interval], temp);
while (j>= interval && Flag == 1){
array[j] = array[j-interval];
j = j-interval;
Flag = BOS(array[j-interval], temp);
}
array[j] = temp;
}
interval = interval / 2;
}
}
void main()
{
float array[array_size] = {1.6,-1.5,1.4,-1.3,1.2,-1.1};
ShellSort(array);
for (int i = 0;i<array_size;i++){
printf("%f\n",array[i]);
}
return 0;
}
```
#### result compile with [Online C Compiler](https://www.programiz.com/c-programming/online-compiler/)

### Assembly code
#### version 1.2 without using BF16 format
```asm!
.data
dataset1: .word 0x3fcccccd #1.6
dataset2: .word 0xbfc00000 #-1.5
dataset3: .word 0x3fb33333 #1.4
dataset4: .word 0xbfa66666 #-1.3
dataset5: .word 0x3f99999a #1.2
dataset6: .word 0xbf8ccccd #-1.1
array_size: .word 0x00000006
_EOL: .string "\n"
.text
main:
# float array[array_size] = {1.6,-1.5,1.4,-1.3,1.2,-1.1};
addi, sp,sp,-20
lw, s0,dataset1
lw, s1,dataset2
lw, s2,dataset3
lw, s3,dataset4
lw, s4,dataset5
lw, s5,dataset6
sw, s0,0(sp)
sw, s1,4(sp)
sw, s2,8(sp)
sw, s3,12(sp)
sw, s4,16(sp)
sw, s5,20(sp)
lw, s11,array_size
jal, ShellSort
ShellSort:
lw, t0,array_size
srli, t0,t0,1 # int interval = array_size / 2;
Whileinterval:
beq, t0,zero,Print # while (interval >0)
add, t1,zero,t0 # i = interval
Foriarraysize:
beq, t1,s11,Interval_div2 # i<interval
add, t2,zero,t1 # j = i
slli, s0,t1,2 # temp = array[i] ,t4 = temp
add, s1,sp,s0
lw, t4,0(s1)
ReadData_Temp_done:
sub, t3,t2,t0 # j = j - interval
slli, s0,t3,2 # array[j-interval] ,t5 = j-interval
add, s1,sp,s0
lw, t5,0(s1)
jal, BOS # Flag = BOS(array[j-interval], temp);
WhilejandFlag:
beq, s10,zero,arrayjtotemp
bltu, t2,t0,arrayjtotemp # while (j>= interval && Flag == 1)
slli, s0,t2,2 # array[j] = array[j-interval]
add, s1,sp,s0
sw, t5,0(s1)
sub,t2,t2,t0
jal, ReadData_Temp_done
arrayjtotemp:
slli, s0,t2,2 # array[j] = temp
add, s1,sp,s0
sw, t4,0(s1)
addi, t1,t1,1 # i++
jal, Foriarraysize
Interval_div2:
srli, t0,t0,1
jal, Whileinterval
#--------------------------------------------------
BOS:
xor, t6,t4,t5
srli, t6,t6,31
beq, t6,zero,Same
blt, t5,t4,Big
addi, s10,zero,1
jal, WhilejandFlag
Big:
add, s10,zero,zero
jal, WhilejandFlag
Same:
srli, t6,t4,31
beq, t6,zero,pos
bltu, t5,t4,negBig
add, s10,zero,zero
jal, WhilejandFlag
negBig:
addi, s10,zero,1
jal, WhilejandFlag
pos:
blt, t5,t4,posBig
addi, s10,zero,1
jal, WhilejandFlag
posBig:
add, s10,zero,zero
jal, WhilejandFlag
#--------------------------------------------------
Print:
slli, s0,t6,2
add, s1,sp,s0
lw, a0,0(s1)
li, a7,2
ecall
addi, t6,t6,1
la, a0,_EOL
li, a7,4
ecall
beq, t6,s11,End
jal, Print
End:
addi, sp,sp,20
li, a7,10
ecall
```
##### output

#### version 1.3 with using BF16 format (final)
```asm!
.data
dataset1: .word 0x3fcccccd #1.6
dataset2: .word 0xbfc00000 #-1.5
dataset3: .word 0x3fb33333 #1.4
dataset4: .word 0xbfa66666 #-1.3
dataset5: .word 0x3f99999a #1.2
dataset6: .word 0xbf8ccccd #-1.1
array_size: .word 0x00000006
sign: .word 0x80000000
exp: .word 0x7F800000
man: .word 0x007FFFFF
bf16man: .word 0x007F0000
bf16format: .word 0xFFFF0000
_EOL: .string "\n"
.text
main:
# float array[array_size] = {1.6,-1.5,1.4,-1.3,1.2,-1.1};
addi, sp,sp,-20
lw, s0,dataset1 #1.6
lw, s1,dataset2 #-1.5
lw, s2,dataset3 #1.4
lw, s3,dataset4 #-1.3
lw, s4,dataset5 #1.2
lw, s5,dataset6 #-1.1
lw, a0,sign
lw, a1,exp
lw, a2,man
lw, a3,bf16man
lw, a4,bf16format
sw, s0,0(sp)
sw, s1,4(sp)
sw, s2,8(sp)
sw, s3,12(sp)
sw, s4,16(sp)
sw, s5,20(sp)
lw, s11,array_size
jal, ShellSort
ShellSort:
lw, t0,array_size
srli, t0,t0,1 # int interval = array_size / 2;
Whileinterval:
beq, t0,zero,Print # while (interval >0)
add, t1,zero,t0 # i = interval
Foriarraysize:
beq, t1,s11,Interval_div2 # i<interval
add, t2,zero,t1 # j = i
slli, s0,t1,2 # temp = array[i] t5 = temp
add, s1,sp,s0
lw, t5,0(s1)
ReadData_Temp_done:
sub, t3,t2,t0 # j = j - interval
slli, s0,t3,2 # array[j-interval], t4 = j-interval
add, s1,sp,s0
lw, t4,0(s1)
jal, fp32_to_bf16 # Flag = BOS(array[j-interval], temp);
WhilejandFlag:
beq, s10,zero,arrayjtotemp
bltu, t2,t0,arrayjtotemp # while (j>= interval && Flag == 1)
and, t4,t4,a4 # bf16format
slli, s0,t2,2 # array[j] = array[j-interval]
add, s1,sp,s0
sw, t4,0(s1)
sub,t2,t2,t0
jal, ReadData_Temp_done
arrayjtotemp:
and, t5,t5,a4 # bf16format
slli, s0,t2,2 # array[j] = temp
add, s1,sp,s0
sw, t5,0(s1)
addi, t1,t1,1 # i++
jal, Foriarraysize
Interval_div2:
srli, t0,t0,1
jal, Whileinterval
#--------------------------------------------------
BOS:
blt, s0,s1,SMALL
beq, s0,s1,Sigsame
jal, BIG # if (sig1 < sig2)
Sigsame:
beq, s2,s3,Expsame
beq, s0,zero,Exp1 # if (sig1 == 0 && sig1 == sig2)
jal, Exp2 # if (sig1 != 0 && sig1 == sig2)
Expsame:
beq, s0,zero,Man1 # if (sig1 == 0 && sig1 == sig2 && exp1 == exp2)
jal, Man2
Exp1:
blt, s2,s3,BIG # exp1 > exp2
jal, SMALL
Exp2:
blt, s2,s3,SMALL # exp1 < exp2
jal, BIG
Man1:
blt, s6,s7,SMALL # man1 > man2
jal, BIG
Man2:
blt, s6,s7,BIG # man1 < man2
jal, SMALL
BIG: # return 1;
addi, s10,zero,1
jal, WhilejandFlag
SMALL: # return 0
addi, s10,zero,0
jal, WhilejandFlag
# sig1 = bf16_1 & 0x80000000;
# sig2 = bf16_2 & 0x80000000;
# exp1 = bf16_1 & 0x7F800000;
# exp2 = bf16_2 & 0x7F800000;
# man1 = bf16_1 & 0x007F0000;
# man2 = bf16_2 & 0x007F0000;
# take two number's sig, exp, man and bf16man
fp32_to_bf16:
and s0,a0,t4
and s1,a0,t5 # sig
and s2,a1,t4
and s3,a1,t5 # exp
and s4,a2,t4
and s5,a2,t5 # man
and s6,a3,t4
and s7,a3,t5 # bf16man
jal BOS
#--------------------------------------------------
Print:
slli, s0,t6,2
add, s1,sp,s0
lw, a0,0(s1)
li, a7,2
ecall
addi, t6,t6,1
la, a0,_EOL
li, a7,4
ecall
beq, t6,s11,End
jal, Print
End:
addi, sp,sp,20
li, a7,10
ecall
```
##### output

## Analysis & Pipeline
generated from Ripes
```
00000000 <main>:
0: fec10113 addi x2 x2 -20
4: 10000417 auipc x8 0x10000
8: ffc42403 lw x8 -4 x8
c: 10000497 auipc x9 0x10000
10: ff84a483 lw x9 -8 x9
14: 10000917 auipc x18 0x10000
18: ff492903 lw x18 -12 x18
1c: 10000997 auipc x19 0x10000
20: ff09a983 lw x19 -16 x19
24: 10000a17 auipc x20 0x10000
28: feca2a03 lw x20 -20 x20
2c: 10000a97 auipc x21 0x10000
30: fe8aaa83 lw x21 -24 x21
34: 10000517 auipc x10 0x10000
38: fe852503 lw x10 -24 x10
3c: 10000597 auipc x11 0x10000
40: fe45a583 lw x11 -28 x11
44: 10000617 auipc x12 0x10000
48: fe062603 lw x12 -32 x12
4c: 10000697 auipc x13 0x10000
50: fdc6a683 lw x13 -36 x13
54: 10000717 auipc x14 0x10000
58: fd872703 lw x14 -40 x14
5c: 00812023 sw x8 0 x2
60: 00912223 sw x9 4 x2
64: 01212423 sw x18 8 x2
68: 01312623 sw x19 12 x2
6c: 01412823 sw x20 16 x2
70: 01512a23 sw x21 20 x2
74: 10000d97 auipc x27 0x10000
78: fa4dad83 lw x27 -92 x27
7c: 004000ef jal x1 4 <ShellSort>
00000080 <ShellSort>:
80: 10000297 auipc x5 0x10000
84: f982a283 lw x5 -104 x5
88: 0012d293 srli x5 x5 1
0000008c <Whileinterval>:
8c: 0e028263 beq x5 x0 228 <Print>
90: 00500333 add x6 x0 x5
00000094 <Foriarraysize>:
94: 07b30063 beq x6 x27 96 <Interval_div2>
98: 006003b3 add x7 x0 x6
9c: 00231413 slli x8 x6 2
a0: 008104b3 add x9 x2 x8
a4: 0004af03 lw x30 0 x9
000000a8 <ReadData_Temp_done>:
a8: 40538e33 sub x28 x7 x5
ac: 002e1413 slli x8 x28 2
b0: 008104b3 add x9 x2 x8
b4: 0004ae83 lw x29 0 x9
b8: 094000ef jal x1 148 <fp32_to_bf16>
000000bc <WhilejandFlag>:
bc: 020d0063 beq x26 x0 32 <arrayjtotemp>
c0: 0053ee63 bltu x7 x5 28 <arrayjtotemp>
c4: 00eefeb3 and x29 x29 x14
c8: 00239413 slli x8 x7 2
cc: 008104b3 add x9 x2 x8
d0: 01d4a023 sw x29 0 x9
d4: 405383b3 sub x7 x7 x5
d8: fd1ff0ef jal x1 -48 <ReadData_Temp_done>
000000dc <arrayjtotemp>:
dc: 00ef7f33 and x30 x30 x14
e0: 00239413 slli x8 x7 2
e4: 008104b3 add x9 x2 x8
e8: 01e4a023 sw x30 0 x9
ec: 00130313 addi x6 x6 1
f0: fa5ff0ef jal x1 -92 <Foriarraysize>
000000f4 <Interval_div2>:
f4: 0012d293 srli x5 x5 1
f8: f95ff0ef jal x1 -108 <Whileinterval>
000000fc <BOS>:
fc: 04944463 blt x8 x9 72 <SMALL>
100: 00940463 beq x8 x9 8 <Sigsame>
104: 038000ef jal x1 56 <BIG>
00000108 <Sigsame>:
108: 01390663 beq x18 x19 12 <Expsame>
10c: 00040863 beq x8 x0 16 <Exp1>
110: 014000ef jal x1 20 <Exp2>
00000114 <Expsame>:
114: 00040c63 beq x8 x0 24 <Man1>
118: 01c000ef jal x1 28 <Man2>
0000011c <Exp1>:
11c: 03394063 blt x18 x19 32 <BIG>
120: 024000ef jal x1 36 <SMALL>
00000124 <Exp2>:
124: 03394063 blt x18 x19 32 <SMALL>
128: 014000ef jal x1 20 <BIG>
0000012c <Man1>:
12c: 017b4c63 blt x22 x23 24 <SMALL>
130: 00c000ef jal x1 12 <BIG>
00000134 <Man2>:
134: 017b4463 blt x22 x23 8 <BIG>
138: 00c000ef jal x1 12 <SMALL>
0000013c <BIG>:
13c: 00100d13 addi x26 x0 1
140: f7dff0ef jal x1 -132 <WhilejandFlag>
00000144 <SMALL>:
144: 00000d13 addi x26 x0 0
148: f75ff0ef jal x1 -140 <WhilejandFlag>
0000014c <fp32_to_bf16>:
14c: 01d57433 and x8 x10 x29
150: 01e574b3 and x9 x10 x30
154: 01d5f933 and x18 x11 x29
158: 01e5f9b3 and x19 x11 x30
15c: 01d67a33 and x20 x12 x29
160: 01e67ab3 and x21 x12 x30
164: 01d6fb33 and x22 x13 x29
168: 01e6fbb3 and x23 x13 x30
16c: f91ff0ef jal x1 -112 <BOS>
00000170 <Print>:
170: 002f9413 slli x8 x31 2
174: 008104b3 add x9 x2 x8
178: 0004a503 lw x10 0 x9
17c: 00200893 addi x17 x0 2
180: 00000073 ecall
184: 001f8f93 addi x31 x31 1
188: 10000517 auipc x10 0x10000
18c: ea850513 addi x10 x10 -344
190: 00400893 addi x17 x0 4
194: 00000073 ecall
198: 01bf8463 beq x31 x27 8 <End>
19c: fd5ff0ef jal x1 -44 <Print>
000001a0 <End>:
1a0: 01410113 addi x2 x2 20
1a4: 00a00893 addi x17 x0 10
1a8: 00000073 ecall
```
### IF stage
- From `PC` is `0x0`. We can know the instruction in IF is `0: fec10113 addi x2 x2 -20`, which is mapped to `addi sp,sp,-20`
- And `PC` is `0x4`. Because there is no branch happen.
- By `IR`, we can get the machine code `0xfec10113` from `Instr. memory`

- `PC` will be `PC+4`

### ID stage
- The `addi x2 x2 -20 (addi sp,sp,-20)` is an I-type instruction. The `OP code` for `addi` is `0010011` , `func3` is `000` and `-20` in binary is `11111111111111111111111111101100` and `0xffffffec` in hex. So the machine is
|IMM|rs1|func3|rd|op|
|:-:|:-:|:-:|:-:|:-:|
|111111101100|00010|000|00010|0010011|
And `0xfec10113` in hex.
After decoder, the processor get the `OPcode` `0010011` and know that this is `addi` instruction. And destination register is `x2` which is map to sp register.

### EX stage
1. The `addi x2 x2 -20` would add `x2` and `imm` , which is `0x7fffffff0` and `0xffffffec` and the result is `0x7fffffdc`

and upper immediate value `0x10000000`. Then the result is `0x10000000`
2. The `7c: 004000ef jal x1 4 <ShellSort>` set the `PC` to new position

### MEM stage
1. The `5c: 00812023 sw x8 0 x2` instruction store data from the register of `x8`, which is `s0` to address `0(x2)` which is `0(sp)`.and data in `x8` is `0x3fcccccd` and address `0(x2)` is `0x7fffffdc`

and `Read out` is `0x00000000`
### WB stage
1. The `0: fec10113 addi x2 x2 -20` instruction write the EX stage result `0x7fffffdc` to the register `x2`.

