Try   HackMD

2020q3 Homework6 (quiz6)

contributed by < Rsysz >

tags: sysprog

測驗題

1.

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →

在深度學習中, weights,gradients activations 等各種 parameter 皆涉及滿滿的浮點數運算,因此各種加速運算的方法推陳出新,FP16TPU 便是 Google 所設計的加速技巧,能為 model training 帶來以下的好處。

  • 搭配專用硬體 TPU 能大幅加速運算速度
  • 因各種 parameter 占用空間的大幅下降,16GB的RAM能當32G的用
  • 加速 training 過程,使 cycle 的時間縮短,加速模型迭代
  • 雖然以前有研究表示降低浮點數的精度將會導致模型準確度下降,但根據 Google 所做的實驗若只針對 activations 採用 FP16,將能加速運算且減少對模型準確度的影響。
float fp32tobf16(float x) {
    float y = x;
    int *py = (int *) &y;
    unsigned int exp, man;
    exp = *py & 0x7F800000u;
    man = *py & 0x007FFFFFu;
    if (!exp && !man) /* zero */           
        return x;
    if (exp == 0x7F800000u) /* infinity or NaN */
        return x;

    /* Normalized number. round to nearest */
    float r = x;
    int *pr = (int *) &r;
    *pr &= BB1;
    r /= 256;
    y = x + r;

    *py &= BB2;
    return y;
}

根據BF16wiki兩者的結構如下圖所示

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →

  • 轉換前針對 NaNINF0 等特例先行排除
  • 利用BB1 = (a) 0x7f800000 取出 Exp 部分,根據 Exp 部分數值
    2Exp8
    帶入
  • y=x+r=2Exp×1.XX+2Exp8=2Exp(1.XX+28)
    代表若將 BF32 展開,
    28
    部分若有值,將會對 Fraction 部分進位






SR



BF16

0

0

1

1

1

1

1

0

0

0

1

0

0

0

0

0

0

...



num

2^-8



BF16:f16->num





Sign

Sign



BF16:f0->Sign





Exp

Exp



BF16:f1->Exp





BF16:f8->Exp





Fraction

Fraction



BF16:f9->Fraction





BF16:f15->Fraction





  • 最後利用BB2 = (e) 0xffff0000 截斷後方 16 個 Bits (
    28
    223
    )

2.

#define RINGBUF_DECL(T, NAME) \
    typedef struct {          \
        int size;             \
        int start, end;       \
        T *elements;          \
    } NAME

#define RINGBUF_INIT(BUF, S, T)             \
    {                                       \
        static T static_ringbuf_mem[S + 1]; \
        BUF.elements = static_ringbuf_mem;  \
    }                                       \
    BUF.size = S;                           \
    BUF.start = 0;                          \
    BUF.end = 0;

#define NEXT_START_INDEX(BUF) \
    (((BUF)->start != (BUF)->size) ? ((BUF)->start + RB1) : 0)
#define NEXT_END_INDEX(BUF) (((BUF)->end != (BUF)->size) ? ((BUF)->end + RB2) : 0)

#define is_ringbuf_empty(BUF) ((BUF)->end == (BUF)->start)
#define is_ringbuf_full(BUF) (NEXT_END_INDEX(BUF) == (BUF)->start)

#define ringbuf_write_peek(BUF) (BUF)->elements[(BUF)->end]
#define ringbuf_write_skip(BUF)                   \
    do {                                          \
        (BUF)->end = NEXT_END_INDEX(BUF);         \
        if (is_ringbuf_empty(BUF))                \
            (BUF)->start = NEXT_START_INDEX(BUF); \
    } while (0)

#define ringbuf_read_peek(BUF) (BUF)->elements[(BUF)->start]
#define ringbuf_read_skip(BUF) (BUF)->start = NEXT_START_INDEX(BUF);

#define ringbuf_write(BUF, ELEMENT)        \
    do {                                   \
        ringbuf_write_peek(BUF) = ELEMENT; \
        ringbuf_write_skip(BUF);           \
    } while (0)

#define ringbuf_read(BUF, ELEMENT)        \
    do {                                  \
        ELEMENT = ringbuf_read_peek(BUF); \
        ringbuf_read_skip(BUF);           \
    } while (0)

dowhile(0)的妙用

  • 首先看到 RINGBUF_DECLRINGBUF_INIT,宣告了一個 RINGBUF 結構並在 RINGBUF_INIT 內建立輸入 S + 1 大小的陣列,將startend 標記元素 0

  • 接著看到 ringbuf_write,呼叫了 ringbuf_write_peekringbuf_write_skip

  • ringbuf_write_peek 將元素存入 (BUF)->elements[(BUF)->start]

  • ringbuf_write_skip 則用來判斷 startend 元素位置的更動







SR



BF16

5

...



s

start



s->BF16:f0





e

end



e->BF16:f0





BF16_d

 

 

...



s_d

start



s_d->BF16_d:f0





e_d

end



e_d->BF16_d:f1





但若 S = 2 陣列則為3,此時寫入 3







SR



BF16

5

4

 



s

start



s->BF16:f0





e

end



e->BF16:f2





BF16_d

5

4

3



s_d

start



s_d->BF16_d:f0





e_d

end



e_d->BF16_d:f0





BF16_t

5

4

3



s_t

start



s_t->BF16_t:f1





e_t

end



e_t->BF16_t:f0





因此可以不斷循環寫入,覆蓋先前的資料

  • 所以RB1 = (a) 1, RB2 = (a) 1

3.

考慮到以下靜態初始化的 singly-linked list 實作:

#include <stdio.h>

/* clang-format off */
#define cons(x, y) (struct llist[]){{y, x}}
/* clang-format on */

struct llist {
    int val;
    struct llist *next;
};

void sorted_insert(struct llist **head, struct llist *node)
{
    if (!*head || (*head)->val >= node->val) {
        SS1;
        SS2;	
        return;
    }
    struct llist *current = *head;
    while (current->next && current->next->val < node->val)
        current = current->next;
    node->next = current->next;
    current->next = node;
}

void sort(struct llist **head)
{
    struct llist *sorted = NULL;
    for (struct llist *current = *head; current;) {
        struct llist *next = current->next;
        sorted_insert(&sorted, current);
        current = next;
    }
    *head = sorted;
}

int main()
{
    struct llist *list = cons(cons(cons(cons(NULL, A), B), C), D);
    struct llist *p;
    for (p = list; p; p = p->next)
        printf("%d", p->val);
    printf("\n");

    sort(&list);
    for (p = list; p; p = p->next)
        printf("%d", p->val);
    printf("\n");
    return 0;
}

執行結果為:

9547
4579
  • 這題在實作靜態初始化的 link list,首先看到 #define cons(x, y) (struct llist[]){{y, x}}struct llist *list = cons(cons(cons(cons(NULL, A), B), C), D) 這邊偷偷在 define 內調換了 x, y 的位置,所以 *next = x, val = y,所以其實結構是 D->C->B->A->NULL ,因此
    D = (a) 9
    C = (b) 5
    B = (c) 4
    A = (d) 7
  • 接著看到 sortsorted_insert,從執行結果我們可以確定這是一個 increasing ordersorting!*head || (*head)->val >= node->val 判斷當 sorted 為空或 head 的數值大於當前 node 時,將 node 插入 head 因此
    SS1 = (d) node->next = *head
    SS2 = (b) *head = node
  • 而下面部分則在做剩下的判斷,如果 node 大於 head,則繼續做由小到大的 insertion sort

4.

LeetCode 287. Find the Duplicate Number 給定一個整數序列,其中會有一個數值重複,請找出。
已知條件:

  1. 假設陣列長度為
    n
    ,數值的範圍是
    1
    n1
  2. 重複的數值不一定只重複一次

考慮以下程式碼是可能的解法:

int findDuplicate(int *nums, int numsSize)
{
    int res = 0;
    const size_t log_2 = 8 * sizeof(int) - __builtin_clz(numsSize);
    for (size_t i = 0; i < log_2; i++) {
        int bit = 1 << i;
        int c1 = 0, c2 = 0;
        for (size_t k = 0; k < numsSize; k++) {
            if (k & bit)
                ++c1;
            if (nums[k] & bit)
                ++c2;
        }
        if (CCC)
            res += bit;
    }
    return res;
}

log_2 = 8 * sizeof(int) - __builtin_clz(numsSize)

利用 bit 位移逐個 bit 做檢查並利用

  • c1 紀錄給定的陣列目錄該 bit 有幾個 1
  • c2 紀錄給定的陣列內的數值該 bit 有幾個 1
    因此,若重複的數在該 bit0c1 >= c2,若重複的數在該 bit1c1 < c2 所以 CCC = (b) c1 < c2
  • 舉例來說,nums = [3, 1, 3, 2]
num binary
0 000
1 001
2 010
3 011
c1 022
num binary
3 011
1 001
3 011
2 010
c2 033

走訪全數陣列成員後,

res=20+21=3

延伸問題

int findDuplicate(int* nums, int numsSize){
    bool list[numsSize];
    memset(list, 0, sizeof(list));
    for(int i = 0; i < numsSize; i++){
        list[nums[i]] ^= 1;
        if(!list[nums[i]])
            return nums[i];
    }
    return -1;
}


透過 bool list[numsSize] 建立一個 Flags arrary,當重複的數出現時 trigger,算是比較直觀簡單的做法。