# 2020q3 Homework6 (quiz6) [toc] ## 測驗 1 觀察幾位同學的成果後,我覺得本題有幾點特別要注意的地方 1. 雖然 fp32tobf16 是將 32 bit floating point 轉換為 bfloat16 的 function 但回傳值仍為 32 bit floating point ,而 bfloat16 則是放在靠近 MSB 的 16 bits 2. 對於零、正負無限大、NaN 都保持值不變,對 normalized 進行 rounding ,對 denormalized 無條件捨去 ```c= float fp32tobf16(float x) { float y = x; int *py = (int *) &y; unsigned int exp, man; exp = *py & 0x7F800000u; man = *py & 0x007FFFFFu; if (!exp && !man) /* zero */ return x; if (exp == 0x7F800000u) /* infinity or NaN */ return x; /* Normalized number. round to nearest */ float r = x; int *pr = (int *) &r; *pr &= 0xff800000; r /= 256; y = x + r; *py &= 0xffff0000; return y; } ``` ### 第 3 行 "c 語言中,floating point 不能做bitwise 運算" 從 sammer1107 同學得知 經測試會出現 ```error: invalid operands to binary << (have ‘float’ and ‘unsigned int’)``` 訊息,```<<```可為其他 bitwise 運算子 因此先轉型成 int 才操作 (使用 int 指標來操作此記憶體位置) ### 第 5,6 行 將上述的值透過 bitwise and 來分別取得 exponent 與 fraction(似乎又稱為 mantissa) ### 第 7 行 因為非零的值做 not 皆為零,因此只有 exp 與 man 皆為零才符合條件 若原始值為零,則直接回傳,不須處理 ### 第 9 行 當 exponent 全為 1 ,直接回傳 這麼做可能會出錯,雖然正負無限大和部分 NaN 可以直接去除低位的 16 bits 而不改變,但部分 NaN 的 fraction 會因此變成 0 ,NaN 就變成正負無限大了 我認為應該改成 ```c= if (exp == 0x7F800000u){ /* infinity or NaN */ man = (man << 7) | (man << 14) | (man << 21); man = man & 0x007F0000u; *py = exp | man; return y; } ``` 這是我能想到效率最好的方法 將會被捨去的 bits 的內容透過 bitwise or 移到剩下的 7 bits fraction 不過 sammer1107同學的 ```c= if (exp == 0x7F800000u) { /* infinity or NaN */ // make sure the mantissa is non-zero if NaN *py |= ((man != 0) << 16); *py &= 0xFFFF0000; return y; } ``` 感覺能再省下一些計算 ### 第 15 行 尚未處理的數值剩下 normalized 與 denormalized 將輸入值 and 0xff800000 對於 normalized 會留下相同 exponent 但 fraction 為 0 的值,也就是 $1 \times 2^{exponent-bias}$ ,將其除以 256 也會將 exponent 減 8,也可能將整個數變成 0 (當 exponent 小於等於 8 時) 對於 denormalized 則全為 0 ,因為 denormalized 是假定 fraction 與小數點之前為 0 的情況 將處理過的 r 加回去時,那些 exponent 被減 8 的值會剛好使原始值 fraction 第 8 bit 被加 1 ,達到四捨五入的效果 為何 denormalized 就不做 rounding ? 
我覺得應該加上 ```c= *pr &= 0x80000000; *pr |= ((exp == 0) << 15); y += r; ``` 但之後我在參考連結中看到設計的團隊提及 denormals 的處理方式為直接轉為 0 似乎是為了轉換速度而再犧牲一些精度 DP to SP 轉換 有處理 denormals ```c= double fp64tofp32(double x) { double y = x; long *py = (long *) &y; unsigned long exp62_60, exp, man; exp62_60 = *py & 0x7000000000000000u; exp = *py & 0x7FF0000000000000u; man = *py & 0x000FFFFFFFFFFFFFu; if (!exp && !man){ /* zero */ return x; } if (exp == 0x7FF0000000000000u){ /* infinity or NaN */ *py &= 0xFF80000000000000; *py |= (long)(man != 0) << 32; return y; } /* Normalized and Denormalized number. round to nearest */ double r = x; unsigned long *pr = (long *) &r; if(exp62_60){ *py &= 0x8000000000000000; *py |= 0x7F80000000000000; return y; } *pr &= 0xFFF0000000000000; r /= 2097152; y = x + r; *pr &= 0x8000000000000000; long tmp = (exp == 0); *pr |= (tmp << 28); y += r; *py = *py << 3; *py |= *pr; *py &= 0xFFFFFFFF00000000; return y; } ``` 測試程式 ```c= void print_hex(float x) { int *p = (int *) &x; printf("%08x", *p); } void print_long_hex(double x) { long *p = (long *) &x; printf("%016lx", *p); } int main() { unsigned int sign[] = {0x00000000, 0x80000000}; unsigned int exp[]={0x00000000, 0x07000000, 0x00800000, 0x7f800000}; unsigned int man[]={0x00000000, 0x00400000, 0x00000001, 0x007fffff}; unsigned int data=0; printf("fp32tobf16\n"); for (int i = 0; i < sizeof(sign) / sizeof(sign[0]); i++) { for (int j = 0; j < sizeof(exp) / sizeof(exp[0]); j++){ for (int k = 0; k < sizeof(man) / sizeof(man[0]); k++) { data=0; data|=sign[i]; data|=exp[j]; data|=man[k]; float *a=(float *)&data; printf("sign is %08x\n",sign[i]); printf("exp is %08x\n",exp[j]); printf("man is %08x\n",man[k]); printf("fp32 %f = ", *a); print_hex(*a); printf("\n"); float bf_a = fp32tobf16(*a); printf("bfloat \t\t"); print_hex(bf_a); printf("\n\n"); } } } printf("fp64tofp32\n"); unsigned long lsign[] = {0x0000000000000000, 0x8000000000000000}; unsigned long lexp[]={0x0000000000000000, 0x7000000000000000, 0x0010000000000000, 
0x7ff0000000000000}; unsigned long lman[]={0x0000000000000000, 0x0008000000000000, 0x0000000010000000, 0x0000000000000001, 0x000fffffffffffff}; unsigned long ldata=0; for (int i = 0; i < sizeof(lsign) / sizeof(lsign[0]); i++) { for (int j = 0; j < sizeof(lexp) / sizeof(lexp[0]); j++){ for (int k = 0; k < sizeof(lman) / sizeof(lman[0]); k++) { ldata=0; ldata|=lsign[i]; ldata|=lexp[j]; ldata|=lman[k]; double *a=(double *)&ldata; printf("sign is %016lx\n",lsign[i]); printf("exp is %016lx\n",lexp[j]); printf("man is %016lx\n",lman[k]); printf("fp64 %lf = ", *a); print_long_hex(*a); printf("\n"); *a = fp64tofp32(*a); ldata = ldata >> 32; unsigned int data = (unsigned int)ldata; float *fp32 = (float *)&data; printf("fp32 %f = 00000000", *fp32); print_hex(*fp32); printf("\n\n"); } } } return 0; } ``` 主要是針對特殊值來做測試,涵蓋以下情況的排列組合: 正負號、 exponent 全為 0 或 1、 exponent 數值中等或極小、 mantissa 全為 0 或 1、 mantissa 的 MSB、LSB 或被捨棄部分的 MSB 為 1 等等 其中值得注意的是 ``` sign = 0x0000000000000000 exp = 0x0010000000000000 man = 0x000fffffffffffff output 0x01000002 ``` 這筆測資 原始值 `0x001fffffffffffff` 在加上 rounding 值 `0x0000000080000000` 後,原先的 mantissa 由 `1.ffffffffffffffff` 變成 `2.000000007fffffff` ,因為要維持小數點左側為 1 因此向右 shift 1,此時就先做了一次 rounding 導致結果從 `1.000000003fffffff8`(對 8 rounding) 變成 `1.0000000040000000` 而 `0x0020000040000000` 向左 shift 3 (對應程式中的 `*py << 3`,因為 fp32 的 exponent 比 fp64 少 3 bits) 並取高位的 32 bits 轉換為 fp32 後變成 `0x01000002` ### 批次處理 為了批次處理先將原始的轉換 function 去除條件判斷,並調整呼叫及回傳的形式,也許光是這樣執行效能就有差別 ```c= void fp32tobf16_noif(float* fp32, float* ans, int size) { for (int i = 0; i < size; i++) { float y = fp32[i]; unsigned int* py = (unsigned int*)&y; float r = fp32[i]; unsigned int* pr = (unsigned int*)&r; unsigned int exp, man; exp = *py & 0x7F800000u; man = *py & 0x007FFFFFu; //NaN *py |= ((exp == 0x7F800000u && man != 0) << 16); //Normals rounding *pr &= 0xff800000; r /= 256; y = fp32[i] + r; //Denormals rounding *pr &= 0x80000000; *pr |= ((exp == 0 && man != 0) << 15); y += r; *py &= 0xffff0000; ans[i] = y; } } ``` 對應的批次轉換 function ```c= void fp32tobf16_batch(float*
fp32, float* ans, int size) { for (int i = 0; i < size; i += 4) { unsigned int sign[4] = { 0x80000000u, 0x80000000u , 0x80000000u , 0x80000000u }; unsigned int exp[4] = { 0x7f800000u, 0x7f800000u , 0x7f800000u , 0x7f800000u }; unsigned int man[4] = { 0x007FFFFFu, 0x007FFFFFu , 0x007FFFFFu , 0x007FFFFFu }; unsigned int zero[4] = { 0x0u, 0x0u , 0x0u , 0x0u }; float normal_round_div[4] = { 256, 256, 256, 256 }; unsigned int denormal_round[4] = { 0x00008000u, 0x00008000u, 0x00008000u, 0x00008000u }; unsigned int bit16[4] = { 0xffff0000u, 0xffff0000u, 0xffff0000u, 0xffff0000u }; float out[4]; __m128 y = _mm_load_ps(&fp32[i]); __m128 r = _mm_load_ps(&fp32[i]); __m128 cons_sign = _mm_load_ps((float*)sign); __m128 cons_exp = _mm_load_ps((float*)exp); __m128 cons_man = _mm_load_ps((float*)man); __m128 cons_zero = _mm_load_ps((float*)zero); __m128 m_exp = _mm_and_ps(y, cons_exp); __m128 m_man = _mm_and_ps(y, cons_man); //NaN __m128 exp_all_1 = _mm_cmpeq_ps(m_exp, cons_exp); __m128 man_not_0 = _mm_cmpneq_ps(m_man, cons_zero); __m128 nan_fix = _mm_and_ps(exp_all_1, man_not_0); nan_fix = _mm_and_ps(nan_fix, cons_man); y = _mm_or_ps(y, nan_fix); //Normals rounding __m128 nrd = _mm_load_ps(normal_round_div); r = _mm_or_ps(_mm_and_ps(r, cons_sign), _mm_and_ps(r, cons_exp)); r = _mm_div_ps(r, nrd); y = _mm_add_ps(y, r); //Denormals rounding __m128 dr = _mm_load_ps((float*)denormal_round); r = _mm_and_ps(r, cons_sign); r = _mm_or_ps(r, _mm_and_ps(_mm_and_ps(_mm_cmpeq_ps(m_exp, cons_zero), man_not_0), dr));//*pr |= ((exp == 0 && man != 0) << 15); _mm_store_ps(out, r); y = _mm_add_ps(y, r); __m128 mbit16 = _mm_load_ps((float*)bit16); y = _mm_and_ps(y, mbit16); _mm_store_ps(&ans[i], y); } } ``` 結果 ![](https://i.imgur.com/Zf43bIy.png) ![](https://i.imgur.com/VS35eqe.png) 批次處理一次將 4 個 fp32 一起轉換,但是花費時間卻低於原本的 $\dfrac{1}{4}$ ,還沒找出原因 ## 測驗 2 ```c= #define RINGBUF_DECL(T, NAME) \ typedef struct { \ int size; \ int start, end; \ T *elements; \ } NAME ``` 這樣設計可以使這個 struct 更具彈性,可調整 ring buffer 
使用的資料型態 題目中使用的為 int ```c= #define RINGBUF_INIT(BUF, S, T) \ { \ static T static_ringbuf_mem[S + 1]; \ BUF.elements = static_ringbuf_mem; \ } \ BUF.size = S; \ BUF.start = 0; \ BUF.end = 0; ``` 依照設定初始化 my_buf 的起、終點和容量,並宣告 buffer 的記憶體空間 (配置 S + 1 個元素,因此合法的索引範圍為 0 到 size) 將 my_buf.elements 指標設定到該記憶體空間起始位置 ```c= #define NEXT_START_INDEX(BUF) \ (((BUF)->start != (BUF)->size) ? ((BUF)->start + 1) : 0) ``` 算出 start 的下一個位置,若超出範圍則設回 buffer 起始位置 ```c= #define NEXT_END_INDEX(BUF) (((BUF)->end != (BUF)->size) ? ((BUF)->end + 1) : 0) ``` 算出 end 的下一個位置,若超出範圍則設回 buffer 起始位置 ```c= #define is_ringbuf_empty(BUF) ((BUF)->end == (BUF)->start) ``` 藉由判斷 end 與 start 是否重疊來知道 buffer 是否為空 ```c= #define is_ringbuf_full(BUF) (NEXT_END_INDEX(BUF) == (BUF)->start) ``` 若 end 的下一個位置為 start 就表示 buffer 是滿的 ```c= #define ringbuf_write_peek(BUF) (BUF)->elements[(BUF)->end] ``` 取得 end 所指位置的元素 (展開後是 lvalue,因此可以直接對它賦值) ```c= #define ringbuf_write_skip(BUF) \ do { \ (BUF)->end = NEXT_END_INDEX(BUF); \ if (is_ringbuf_empty(BUF)) \ (BUF)->start = NEXT_START_INDEX(BUF); \ } while (0) ``` 將 end 往下一格移動,若重疊到 start 則 start 也往下一格移動 (也就是 buffer 已滿時,捨棄最舊的一筆資料來容納新資料) ```c= #define ringbuf_read_peek(BUF) (BUF)->elements[(BUF)->start] ``` 取得 start 所指位置的元素 ```c= #define ringbuf_read_skip(BUF) (BUF)->start = NEXT_START_INDEX(BUF); ``` 將 start 往下一格移動 ```c= #define ringbuf_write(BUF, ELEMENT) \ do { \ ringbuf_write_peek(BUF) = ELEMENT; \ ringbuf_write_skip(BUF); \ } while (0) ``` 在 end 寫入資料並調整 end 位置 ```c= #define ringbuf_read(BUF, ELEMENT) \ do { \ ELEMENT = ringbuf_read_peek(BUF); \ ringbuf_read_skip(BUF); \ } while (0) ``` 讀取 start 所在的 buffer 的值並調整 start 的位置 ### do { ... } while (0) 使用的考量 為了解決巨集搭配 if / else 使用時會發生 dangling else 的問題,同時避免 statements 被分開到不同 scope (以下範例中的 `add_two_num(a,b)` 代表呼叫該巨集) 第一種情況:巨集直接由多個 statement 組成 ```c= #define statements(x,y) statement1;statement2 if(True) add_two_num(a,b); ``` 展開成 ```c= if(True) statement1; statement2; ``` 只有 statement1 屬於 if 的範圍,statement2 無論條件是否成立都會執行 第二種情況:改用大括號包住 ```c= #define statements(x,y) {statement1;statement2} if(True) add_two_num(a,b); else statement3; ``` 展開成 ```c= if(True){ statement1; statement2; }; else statement3; ``` `}` 之後多出的 `;` 會先結束 if 敘述,使得後面的 else 找不到對應的 if 而造成編譯錯誤 而 `do { ... } while (0)` 本身需要以 `;` 結尾,展開後仍是單一 statement,因此可以同時避免上述兩種問題 ## 測驗 3