2020q3 Homework6 (quiz6)

contributed by < Rsysz >

tags: `sysprog`

測驗題

1.

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

在深度學習中， weights,gradients activations 等各種 parameter 皆涉及滿滿的浮點數運算，因此各種加速運算的方法推陳出新，FP16 與 TPU 便是 Google 所設計的加速技巧，能為 model training 帶來以下的好處。

搭配專用硬體 TPU 能大幅加速運算速度
因各種 parameter 占用空間的大幅下降，16GB的RAM能當32G的用
加速 training 過程，使 cycle 的時間縮短，加速模型迭代
雖然以前有研究表示降低浮點數的精度將會導致模型準確度下降，但根據 Google 所做的實驗若只針對 activations 採用 FP16，將能加速運算且減少對模型準確度的影響。

float fp32tobf16(float x) {
    float y = x;
    int *py = (int *) &y;
    unsigned int exp, man;
    exp = *py & 0x7F800000u;
    man = *py & 0x007FFFFFu;
    if (!exp && !man) /* zero */           
        return x;
    if (exp == 0x7F800000u) /* infinity or NaN */
        return x;

    /* Normalized number. round to nearest */
    float r = x;
    int *pr = (int *) &r;
    *pr &= BB1;
    r /= 256;
    y = x + r;

    *py &= BB2;
    return y;
}

根據BF16wiki兩者的結構如下圖所示

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

轉換前針對 NaN，INF，0 等特例先行排除
利用BB1 = (a) 0x7f800000 取出 Exp 部分，根據 Exp 部分數值
$2^{E x p - 8}$ 帶入
$y = x + r = 2^{E x p} \times 1. X X + 2^{E x p - 8} = 2^{E x p} (1. X X + 2^{- 8})$ 代表若將 BF32 展開，
$2^{- 8}$ 部分若有值，將會對 Fraction 部分進位

最後利用BB2 = (e) 0xffff0000 截斷後方 16 個 Bits (
$2^{- 8}$ 至
$2^{- 23}$ )

2.

#define RINGBUF_DECL(T, NAME) \
    typedef struct {          \
        int size;             \
        int start, end;       \
        T *elements;          \
    } NAME

#define RINGBUF_INIT(BUF, S, T)             \
    {                                       \
        static T static_ringbuf_mem[S + 1]; \
        BUF.elements = static_ringbuf_mem;  \
    }                                       \
    BUF.size = S;                           \
    BUF.start = 0;                          \
    BUF.end = 0;

#define NEXT_START_INDEX(BUF) \
    (((BUF)->start != (BUF)->size) ? ((BUF)->start + RB1) : 0)
#define NEXT_END_INDEX(BUF) (((BUF)->end != (BUF)->size) ? ((BUF)->end + RB2) : 0)

#define is_ringbuf_empty(BUF) ((BUF)->end == (BUF)->start)
#define is_ringbuf_full(BUF) (NEXT_END_INDEX(BUF) == (BUF)->start)

#define ringbuf_write_peek(BUF) (BUF)->elements[(BUF)->end]
#define ringbuf_write_skip(BUF)                   \
    do {                                          \
        (BUF)->end = NEXT_END_INDEX(BUF);         \
        if (is_ringbuf_empty(BUF))                \
            (BUF)->start = NEXT_START_INDEX(BUF); \
    } while (0)

#define ringbuf_read_peek(BUF) (BUF)->elements[(BUF)->start]
#define ringbuf_read_skip(BUF) (BUF)->start = NEXT_START_INDEX(BUF);

#define ringbuf_write(BUF, ELEMENT)        \
    do {                                   \
        ringbuf_write_peek(BUF) = ELEMENT; \
        ringbuf_write_skip(BUF);           \
    } while (0)

#define ringbuf_read(BUF, ELEMENT)        \
    do {                                  \
        ELEMENT = ringbuf_read_peek(BUF); \
        ringbuf_read_skip(BUF);           \
    } while (0)

do…while(0)的妙用

首先看到 RINGBUF_DECL 與 RINGBUF_INIT，宣告了一個 RINGBUF 結構並在 RINGBUF_INIT 內建立輸入 S + 1 大小的陣列，將start 與 end 標記元素 0
接著看到 ringbuf_write，呼叫了 ringbuf_write_peek 與 ringbuf_write_skip
ringbuf_write_peek 將元素存入 (BUF)->elements[(BUF)->start]
ringbuf_write_skip 則用來判斷 start 與 end 元素位置的更動

但若 S = 2 陣列則為3，此時寫入 3

因此可以不斷循環寫入，覆蓋先前的資料

所以RB1 = (a) 1， RB2 = (a) 1

3.

考慮到以下靜態初始化的 singly-linked list 實作:

#include <stdio.h>

/* clang-format off */
#define cons(x, y) (struct llist[]){{y, x}}
/* clang-format on */

struct llist {
    int val;
    struct llist *next;
};

void sorted_insert(struct llist **head, struct llist *node)
{
    if (!*head || (*head)->val >= node->val) {
        SS1;
        SS2;	
        return;
    }
    struct llist *current = *head;
    while (current->next && current->next->val < node->val)
        current = current->next;
    node->next = current->next;
    current->next = node;
}

void sort(struct llist **head)
{
    struct llist *sorted = NULL;
    for (struct llist *current = *head; current;) {
        struct llist *next = current->next;
        sorted_insert(&sorted, current);
        current = next;
    }
    *head = sorted;
}

int main()
{
    struct llist *list = cons(cons(cons(cons(NULL, A), B), C), D);
    struct llist *p;
    for (p = list; p; p = p->next)
        printf("%d", p->val);
    printf("\n");

    sort(&list);
    for (p = list; p; p = p->next)
        printf("%d", p->val);
    printf("\n");
    return 0;
}

執行結果為:

9547
4579

這題在實作靜態初始化的 link list，首先看到 #define cons(x, y) (struct llist[]){{y, x}} 與 struct llist *list = cons(cons(cons(cons(NULL, A), B), C), D) 這邊偷偷在 define 內調換了 x, y 的位置，所以 *next = x, val = y，所以其實結構是 D->C->B->A->NULL ，因此
D = (a) 9
C = (b) 5
B = (c) 4
A = (d) 7
接著看到 sort 與 sorted_insert，從執行結果我們可以確定這是一個 increasing order 的 sorting，!*head || (*head)->val >= node->val 判斷當 sorted 為空或 head 的數值大於當前 node 時，將 node 插入 head 因此
SS1 = (d) node->next = *head
SS2 = (b) *head = node
而下面部分則在做剩下的判斷，如果 node 大於 head，則繼續做由小到大的 insertion sort

4.

LeetCode 287. Find the Duplicate Number 給定一個整數序列，其中會有一個數值重複，請找出。
已知條件：

假設陣列長度為
$n$ ，數值的範圍是
$1$ 到
$n - 1$
重複的數值不一定只重複一次

考慮以下程式碼是可能的解法:

int findDuplicate(int *nums, int numsSize)
{
    int res = 0;
    const size_t log_2 = 8 * sizeof(int) - __builtin_clz(numsSize);
    for (size_t i = 0; i < log_2; i++) {
        int bit = 1 << i;
        int c1 = 0, c2 = 0;
        for (size_t k = 0; k < numsSize; k++) {
            if (k & bit)
                ++c1;
            if (nums[k] & bit)
                ++c2;
        }
        if (CCC)
            res += bit;
    }
    return res;
}

log_2 = 8 * sizeof(int) - __builtin_clz(numsSize)

利用 bit 位移逐個 bit 做檢查並利用

c1 紀錄給定的陣列目錄該 bit 有幾個 1
c2 紀錄給定的陣列內的數值該 bit 有幾個 1
因此，若重複的數在該 bit 為 0 則 c1 >= c2，若重複的數在該 bit 為 1 則 c1 < c2 所以 CCC = (b) c1 < c2
舉例來說，nums = [3, 1, 3, 2]

num	binary
0	000
1	001
2	010
3	011
c1	022

num	binary
3	011
1	001
3	011
2	010
c2	033

走訪全數陣列成員後，

r e s = 2^{0} + 2^{1} = 3

延伸問題

int findDuplicate(int* nums, int numsSize){
    bool list[numsSize];
    memset(list, 0, sizeof(list));
    for(int i = 0; i < numsSize; i++){
        list[nums[i]] ^= 1;
        if(!list[nums[i]])
            return nums[i];
    }
    return -1;
}

透過 bool list[numsSize] 建立一個 Flags arrary，當重複的數出現時 trigger，算是比較直觀簡單的做法。

2020q3 Homework6 (quiz6)

tags: sysprog

1.

2.

3.

4.

延伸問題

Read more

MTK / Bluetooth

ONVM-IPS / Hyperscan

VPP & DPDK

2020q3 Gameboy-emu

tags: `sysprog`