2022q1 Homework1 (quiz1)

contributed by < leewei05 >

tags: `linux2022`

測驗 1

延伸問題:

解釋上述程式碼運作原理
研讀 Linux 核心原始程式碼 include/linux/hashtable.h 及對應的文件 How does the kernel implements Hashtables?，解釋 hash table 的設計和實作手法，並留意到 tools/include/linux/hash.h 的 GOLDEN_RATIO_PRIME，探討其實作考量

解釋程式碼運作原理

初始化 map_t 物件，並根據 MAP_HASH_SIZE 初始化 hash table slots 的數量。

#include <stddef.h>
#include <stdlib.h>

struct hlist_node { struct hlist_node *next, **pprev; };
struct hlist_head { struct hlist_node *first; };
typedef struct { int bits; struct hlist_head *ht; } map_t;

#define MAP_HASH_SIZE(bits) (1 << bits) 
map_t *map_init(int bits) {
    map_t *map = malloc(sizeof(map_t));
    if (!map)
        return NULL;

    map->bits = bits;
    map->ht = malloc(sizeof(struct hlist_head) * MAP_HASH_SIZE(map->bits));
    if (map->ht) {
        for (int i = 0; i < MAP_HASH_SIZE(map->bits); i++)
            (map->ht)[i].first = NULL;
    } else {
        free(map);
        map = NULL;
    }
    return map;
}

struct hash_key {
    int key;
    void *data;
    struct hlist_node node;
};

#define container_of(ptr, type, member)               \
    ({                                                \
        void *__mptr = (void *) (ptr);                \
        ((type *) (__mptr - offsetof(type, member))); \
    })

為什麼要用 Fibonacci Hashing? 待整理

Linux Kernel Hash Table Behavior

找出對應的論文和技術報告，而非陳年網頁。

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

給定 hash key 並返回 hash value。

#define GOLDEN_RATIO_32 0x61C88647
static inline unsigned int hash(unsigned int val, unsigned int bits) {
    /* High bits are more random, so use them. */
    return (val * GOLDEN_RATIO_32) >> (32 - bits);
}

假設 hash table 有回傳 hash key 則走訪該 hash slot 的 linked list 並回傳指定的 key。

static struct hash_key *find_key(map_t *map, int key) {
    struct hlist_head *head = &(map->ht)[hash(key, map->bits)];
    for (struct hlist_node *p = head->first; p; p = p->next) {
        struct hash_key *kn = container_of(p, struct hash_key, node);
        if (kn->key == key)
            return kn;
    }
    return NULL;
}

如果找到指定的 key 則回傳 *data。

void *map_get(map_t *map, int key)
{
    struct hash_key *kn = find_key(map, key);
    return kn ? kn->data : NULL;
}

如果 key 已存在於 hash table 則不執行 add 的操作，如果 key 不存在，則差入新的 node 至 list 的 head。

void map_add(map_t *map, int key, void *data)
{
    struct hash_key *kn = find_key(map, key);
    if (kn)
        return;

    kn = malloc(sizeof(struct hash_key));
    kn->key = key, kn->data = data;

    struct hlist_head *h = &map->ht[hash(key, map->bits)];
    struct hlist_node *n = &kn->node, *first = h->first;
    n->next = first;
    if (first)
        first->pprev = &n->next;
    h->first = n;
    n->pprev = &h->first;
}

map_deinit 重新初始化 map，並釋放各個 hash node 的記憶體空間。

void map_deinit(map_t *map)
{
    if (!map)
        return;

    for (int i = 0; i < MAP_HASH_SIZE(map->bits); i++) {
        struct hlist_head *head = &map->ht[i];
        for (struct hlist_node *p = head->first; p;) {
            struct hash_key *kn = container_of(p, struct hash_key, node);
            struct hlist_node *n = p;
            p = p->next;

            if (!n->pprev) /* unhashed */
                goto bail;

            struct hlist_node *next = n->next, **pprev = n->pprev;
            *pprev = next;
            if (next)
                next->pprev = pprev;
            n->next = NULL, n->pprev = NULL;

        bail:
            free(kn->data);
            free(kn);
        }
    }
    free(map);
}
 
int *twoSum(int *nums, int numsSize, int target, int *returnSize)
{
    map_t *map = map_init(10);
    *returnSize = 0;
    int *ret = malloc(sizeof(int) * 2);
    if (!ret)
        goto bail;

    for (int i = 0; i < numsSize; i++) {
        int *p = map_get(map, target - nums[i]);
        if (p) { /* found */
            ret[0] = i, ret[1] = *p;
            *returnSize = 2;
            break;
        }

        p = malloc(sizeof(int));
        *p = i;
        map_add(map, nums[i], p);
    }

bail:
    map_deinit(map);
    return ret;
}

避免在程式碼列表中撰寫漢語註解，適度切割程式碼列表，斟酌解釋即可。你以後可能會採用上述程式碼來發表，現在就做好專業作品的準備。

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

Linux Kernal Hash table 筆記

理想上，Hash function 是可以平均分散物件在各個 bucket，但實際寫入的物件還是會有碰撞 (collision) 的產生。當有碰撞產生時，也就是不同物件被分到同一個 bucket，這些物件會透過 linked list 的結構串連起來。

我們可以在自定義的結構體加入 hlist_node 以便操作 Linux Kernel 的 Hash table API。hlist_node 結構體的最後一個物件為 NULL，不同於 cyclic linked list 的 list_node。每一個 hash table 的 bucket 都是一個 struct hlist_head 的指標，也是每一個 linked list 的 head。

操作 Hash Table API

以下實作一個簡單的 hash table module 來操作 hash table API。參考 Linux Kernel Programming Guide 以及 How to use the kernel hashtable API?:

#include <linux/hashtable.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>

struct h_node {
    int data;
    struct hlist_node n;
};

DEFINE_HASHTABLE(ht, 3);

static u32 hash_func(int value, int size)
{
    u32 key = 0;
    key = value % size;

    return key;
}

static int __init h_init(void)
{
    struct h_node node_a, node_b, node_c, *curr;
    unsigned bkt;

    pr_info("Hello, hash table\n");
    pr_info("Is hash table empty? %u", hash_empty(ht));

    node_a.data = 4;
    node_b.data = 5;
    node_c.data = 12;

    u32 key_a = hash_func(4, 8);
    u32 key_b = hash_func(5, 8);
    u32 key_c = hash_func(12, 8);

    pr_info("key_a = %u\n", key_a);
    pr_info("key_b = %u\n", key_b);
    pr_info("key_c = %u\n", key_c);

    hash_init(ht);

    hash_add(ht, &node_a.n, key_a);
    hash_add(ht, &node_b.n, key_b);
    hash_add(ht, &node_c.n, key_c);

    hash_for_each(ht, bkt, curr, n) {
        pr_info("hash: data = %d, node = %p \n", curr->data, &curr->n);
    }

    hash_del(&node_a.n);
    hash_del(&node_b.n);
    hash_del(&node_c.n);

    return 0;
}

static void __exit h_exit(void)
{
    pr_info("Bye, hash table\n");
}

module_init(h_init);
module_exit(h_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Lee Wei");

dmesg 結果

[ 2331.129222] Bye, hash table
[ 2349.439579] Hello, hash table
[ 2349.439583] Is hash table empty? 1
[ 2349.439584] key_a = 4
[ 2349.439584] key_b = 5
[ 2349.439585] key_c = 4
[ 2349.439585] hash: data = 12, node = 000000000cc8bf70
[ 2349.439587] hash: data = 4, node = 0000000030e8aeef
[ 2349.439588] hash: data = 5, node = 0000000009367c18
[ 2409.755636] Bye, hash table

Golden Ratio Prime

測驗 2

延伸問題:

嘗試避免遞迴，寫出同樣作用的程式碼
以類似 Linux 核心的 circular doubly-linked list 改寫，撰寫遞迴和迭代 (iterative) 的程式碼

#include <stddef.h>

struct ListNode {
    int val;
    struct ListNode *next;
};

struct ListNode *deleteDuplicates(struct ListNode *head)
{
    if (!head)
        return NULL;

    if (head->next && head->val == head->next->val) {
        /* Remove all duplicate numbers */
        while (head->next && head->val == head->next->val)
            head = head->next;
        return deleteDuplicates(head->next);
    }

    head->next = deleteDuplicates(head->next);
    return head;
}

迭代版本

#include <stddef.h>

struct ListNode {
    int val;
    struct ListNode *next;
};

struct ListNode *deleteDuplicates(struct ListNode *head)
{
    if (!head)
        return NULL;
        
}

測驗 3

延伸問題:

解釋上述程式碼的運作，撰寫完整的測試程式，指出其中可改進之處並實作
在 Linux 核心找出 LRU 相關程式碼並探討

LRUCache 結構體為 Head，跟實際存 key, value 的 LRUNode 分開。LRUNode 以 doubly linked list 的形式跟 LRUNode 串接。

參考 Risheng 同學以及 Linux 核心原始程式碼巨集: container_of 所畫出來的圖進行修改，lRUCacheCreate 所產生的物件：

#include <stdio.h>
#include <stdlib.h>
#include "list.h"

typedef struct {
    int capacity, count;             
    struct list_head dhead, hheads[];
} LRUCache;
    
typedef struct {
    int key, value;
    struct list_head hlink, dlink;
} LRUNode;

LRUCache *lRUCacheCreate(int capacity)
{   
    LRUCache *obj = malloc(sizeof(*obj) + capacity * sizeof(struct list_head));
    obj->count = 0;
    obj->capacity = capacity;
    INIT_LIST_HEAD(&obj->dhead);
    for (int i = 0; i < capacity; i++)
        INIT_LIST_HEAD(&obj->hheads[i]);
    return obj;
}

先從建立新的 Cache Node 開始分析。Hash Function 實作是給定的 key 跟 capacity 取餘數，所以回傳的 hash 為餘數。

根據以下理由推測 MMM4 是 list_entry。

送出表單後，MMM4 是 list_for_each_entry。但參考其他同學的作答筆記，的確 list_last_entry 比較合理。因為不管是 Put, Get，最後都會執行 list_move 把最新存取的 Cache Node 插入至 Head 的下一個。Head 最尾端的 Node 即為 Least Recent Used Node。

假如 count == capacity 則表示 cache 已經達到上限，需要 evict 距離這次操作最久的 node。所以 list_last_entry 回傳一個 LRUNode 物件合理。
三個參數皆符合函式的型別。@node(pointer to list node), @type, @member

list_del() 只會把指定的 node 移出 list 並不會釋放節點的記憶體，所以可以直接指派新帶入的 key 以及 value，並且把新的 node 加入至 cache list 的 head 以及 hash table 的 list。

    if (obj->count == obj->capacity) {
        // MMM4
        lru = list_last_entry(&obj->dhead, LRUNode, dlink);
        list_del(&lru->dlink);
        list_del(&lru->hlink);
    } else {
        lru = malloc(sizeof(LRUNode));
        obj->count++;
    }
    lru->key = key;
    list_add(&lru->dlink, &obj->dhead);
    list_add(&lru->hlink, &obj->hheads[hash]);
    lru->value = value;

假設原本沒有 Cache，增加一個 node 會如下圖

根據以下理由推測 MMM3 是 list_for_each_entry。

推測在新增 cache node 之前要先走訪 hash slot 的 linked list，如果給定的 key 已經存在 hash table，則把該 node 移到 dhead list 的第一個位置，並更新 cache node 的 value。
三個參數皆符合函式的型別。@entry, @head, @member

void lRUCachePut(LRUCache *obj, int key, int value)
{
    LRUNode *lru;
    int hash = key % obj->capacity;
    // MMM3
    list_for_each_entry(lru, &obj->hheads[hash], hlink) {
        if (lru->key == key) {
            list_move(&lru->dlink, &obj->dhead);
            lru->value = value;
            return;
        }
    }

    ...
}

根據以下理由推測 MMM1 是 list_for_each_entry_safe。

最後一行是 free(obj)，在釋放 LRUCache 的記憶體空間前，需要先逐一釋放 LRUNODE
四個參數皆符合函式的型別。@entry, @safe(pointer to LRUNode), @head, @member
list_for_each_entry_safe 允許在迭代的過程中執行 delete，且也是唯一一個有四個參數的函式

void lRUCacheFree(LRUCache *obj)
{       
    LRUNode *lru, *n;
    // MMM1
    list_for_each_entry_safe(lru, n, &obj->dhead, dlink) {
        list_del(&lru->dlink);
        free(lru);
    }
    free(obj); 
}

根據以下理由推測 MMM2 是 list_for_each_entry。

找尋 Cache 時會需要迭代 hash table 的 linked list，故為 for_each 系列的函式
因為 Get 操作並不會去釋放節點的記憶體空間，所以不會是 for_each_safe。而且帶入的參數也不符合 list_for_each_safe
三個參數皆符合函式的型別。@entry, @head, @member

int lRUCacheGet(LRUCache *obj, int key)
{
    LRUNode *lru;
    int hash = key % obj->capacity;
    // MMM2
    list_for_each_entry(lru, &obj->hheads[hash], hlink) {
        if (lru->key == key) {
            list_move(&lru->dlink, &obj->dhead);
            return lru->value;
        }
    }
    return -1;
}

去閱讀 C 語言規格書，避免日後大量出現「原來可以這樣寫」一類的驚訝。

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

查詢中
6.7.2.1 Structure and union specifiers

可改進之處

待補

測驗 4

延伸題目:

解釋上述程式碼的運作，撰寫完整的測試程式，指出其中可改進之處並實作
嘗試用 Linux 核心風格的 hash table 重新實作上述程式碼

Leetcode 128. Longest Consecutive Sequence

Given an unsorted array of integers nums, return the length of the longest consecutive elements sequence.
You must write an algorithm that runs in O(n) time.

給定一個未排序的陣列，回傳連續數值的長度。

Constraints
0 <= nums.length <= 10^5
-10^9 <= nums[i] <= 10^9

需要考慮到連續的負值，且因為輸入的陣列長度最長為 10^5 所以需要 O(n) 時間複雜度的演算法。

find 函式會從定義的 hash table 查詢是否存在該數值 num，而 hash key 是根據陣列給定的大小取餘數。

#include <stdio.h>
#include <stdlib.h>
#include "list.h"

struct seq_node {
    int num;
    struct list_head link;
};                                                                                      
static struct seq_node *find(int num, int size, struct list_head *heads)
{   
    struct seq_node *node;
    int hash = num < 0 ? -num % size : num % size;
    list_for_each_entry (node, &heads[hash], link) {
        if (node->num == num)
            return node;
    }
    return NULL;
}

第一個迴圈初始化 n_size 個 head 當作是 hash table 的 hash slot。第二個迴圈把輸入陣列的各個數值加入至 hash table。

int longestConsecutive(int *nums, int n_size)
{
    int hash, length = 0;
    struct seq_node *node;
    struct list_head *heads = malloc(n_size * sizeof(*heads));

    for (int i = 0; i < n_size; i++)
        INIT_LIST_HEAD(&heads[i]);

    for (int i = 0; i < n_size; i++) {
        if (!find(nums[i], n_size, heads)) {
            hash = nums[i] < 0 ? -nums[i] % n_size : nums[i] % n_size;
            node = malloc(sizeof(*node));
            node->num = nums[i];
            list_add(&node->link, &heads[hash]);
        }
    }

利用left, right 來查找下一個連續數值，首先查閱規格書：

6.5.2.4 Postfix increment and decrement operators
2. The result of the postfix ++ operator is the value of the operand. After the result is
obtained, the value of the operand is incremented. (That is, the value 1 of the appropriate
type is added to it.) See the discussions of additive operators and compound assignment
for information on constraints, types, and conversions and the effects of operations on
pointers. The side effect of updating the stored value of the operand shall occur between
the previous and the next sequence point.
3. The postfix – operator is analogous to the postfix ++ operator, except that the value of
the operand is decremented (that is, the value 1 of the appropriate type is subtracted from
it).

6.5.3.1 Prefix increment and decrement operators
2. The value of the operand of the prefix ++ operator is incremented. The result is the new
value of the operand after incrementation. The expression ++E is equivalent to (E+=1).
See the discussions of additive operators and compound assignment for information on
constraints, types, side effects, and conversions and the effects of operations on pointers.
3. The prefix – operator is analogous to the prefix ++ operator, except that the value of the
operand is decremented.

根據上述規格書的描述，left++ 會先帶入 left 現在的值，find 函式回傳之後才會加一，++left 會先加一再帶入 find 函式。

根據以下理由推測 LLL 是 --left，RRR 是 ++right。

不管是 left 還是 right，初始的值都已經是 num，所以 LLL 或 RRR 不會是 left 或是 right
num 所屬的 node 已經被算進 len 所以 LLL, RRR 不會是用 postfix increment，而是用 prefix increment
推測 left 是往比 num 小的數值去查找，直到找不到比 num 還要小的連續數值
推測 right 是往比 num 大的數值去查找，直到找不到比 num 還要大的連續數值

    for (int i = 0; i < n_size; i++) {
        int len = 0;
        int num;
        node = find(nums[i], n_size, heads);
        while (node) {
            len++;
            num = node->num;
            list_del(&node->link);

            int left = num, right = num;
            // LLL
            while ((node = find(--left, n_size, heads))) {
                len++;
                list_del(&node->link);
            }

            // RRR
            while ((node = find(++right, n_size, heads))) {
                len++;
                list_del(&node->link);
            }

            length = len > length ? len : length;
        }
    }

    return length;
}

可改進之處

待補

hash 直接這樣取餘數會不會有問題?
longestConsecutive 光是迴圈就跑了三個，如果陣列很長就會跑很久

2022q1 Homework1 (quiz1)

tags: linux2022

測驗 1

解釋程式碼運作原理

Linux Kernal Hash table 筆記

操作 Hash Table API

Golden Ratio Prime

測驗 2

測驗 3

可改進之處

測驗 4

可改進之處

Read more

Compiler Project(Draft)

11/3 DS Review

2022q1 Homework5 (quiz8)

2022q1 Homework3 (fibdrv)

tags: `linux2022`