owned this note
owned this note
Published
Linked with GitHub
# 2022q1 Homework1 (quiz1)
contributed by < [maromasamsa](https://github.com/maromaSamsa) >
> [作業說明](https://hackmd.io/@sysprog/linux2022-quiz1#%E6%B8%AC%E9%A9%97-1)
###### tags: `linux2022`
## 測驗一: Two Sum Ploblem - Hash Table solution
### 回答問題
```c=
void map_add(map_t *map, int key, void *data)
{
...
// AAA
n->next = first;
//
if (first)
first->pprev = &n->next;
h->first = n;
// BBB
n->pprev = &h->first;
//
}
```
#### AAA
`n` 為新加入 hash table 中的資料節點,新加入的節點會放在 hash table bucket 的最前端,這邊 `n->next` 的資料型態為 `struct hlist_node *` , 因此設定為指向原本的第一位資料節點。
#### BBB
這邊主要做的動作是將新加入 hash table 中的資料節點 `n` 向前與 `hash table bucket` 連接,這邊 `n->pprev` 的資料型態為 `struct hlist_node **` 是指標的指標,所指向的資料會是 「指標的地址」,因此使用 `&` 取得 `h->first` ,即 `struct hlist_node *`的地址。
:::success
簡單的理解,指標指向的是「某資料的地址」,指標的指標指向的是「指標的地址」
:::
### Hash table 實作及解釋程式碼運作原理
#### 宣告結構
```c=
#include<stdlib.h>
#include<stdio.h>
struct hlist_node
{
struct hlist_node *next, **pprev;
};
struct hlist_head
{
struct hlist_node *first;
};
typedef struct
{
int bits; // this map has 2^bits hlist_head
struct hlist_head *ht;
}map_t;
```
#### 初始化 Hash Table
建立並初始化一個有 $2^{bits}$ 個欄位的 Hash Table
```c=
#define MAP_HASH_SIZE(bits) (1 << bits)
map_t *map_init(int bits)
{
map_t *map = malloc(sizeof(map_t));
if(!map)
return NULL;
map->bits = bits;
map->ht = malloc(sizeof(struct hlist_head) * MAP_HASH_SIZE(map->bits));
// if malloc return not null, set point to NULL
if (map->ht) {
for(int i = 0; i < MAP_HASH_SIZE(map->bits); ++i)
(map->ht)[i].first = NULL;
} else {
free(map);
map = NULL;
}
return map;
}
```
```graphviz
digraph {
node[shape=record]
splines = false
rankdir = "TB"
subgraph cluster01 {
label="map_t"
subgraph bucket{
ht_bit [label = "{hlist_head[ 2^bits - 1 ] | <f>first}"]
ht_n [label = "{<h>hlist_head[...] | first}"]
ht_1 [label = "{<h>hlist_head[1] | first}"]
ht_0 [label = "{<h>hlist_head[0] | first}"]
}
bits [label="bits"]
}
}
```
> Hash Table 結構示意圖
---
#### Hash key 的定義
`container_of` 的[使用](https://hackmd.io/@sysprog/linux-macro-containerof),我們可以將此結構視為是一個繼承 `hlist_node` 的類別,負責存收錄進 hash Table 的數值與其 hash key,在查找時我們只走訪 `hlist_node node` ,需要取得該節點的更多資訊再使用 `container_of`
```c=
struct hash_key {
int key;
void* data;
struct hlist_node node;
};
#define container_of(ptr, type, member) \
({ \
void *_mptr = (void *)(ptr); \
((type *)(_mptr - offsetof(type, member))); \
})
```
```graphviz
digraph {
node[shape=record]
splines = false
rankdir = "LR"
list_head[label = "<m>list_head | <n>*first"]
hlist_0 [ label = " <hash>hash_key | {<k>key | <d>*data} | <nd>hlist_node | {<p>**pprev | <n>*next}", group = list]
hlist_1 [ label = " <hash>hash_key | {<k>key | <d>*data} | <nd>hlist_node | {<p>**pprev | <n>*next}", group = list]
hlist_0 -> hlist_1 [
weight = 10, style = invis
]
NULL[shape = plaintext, label = "NULL"]
list_head:n -> hlist_0:nd
hlist_0:p -> list_head:n
hlist_0:n -> hlist_1:nd
hlist_1:p -> hlist_0:n
hlist_1:n -> NULL
}
```
> Hash Key 結構示意圖
---
#### 節點搜尋
```c=
#define GOLDEN_RATIO_32 0x61C88647
static inline unsigned int hash(unsigned int val, unsigned int bits) {
/* High bits are more random, so use them. */
return (val * GOLDEN_RATIO_32) >> (32 - bits);
}
/**
* @brief 從雜湊表中尋找並回傳一節點 kn,且 kn.key == @key
*
* 將 @key 經過雜湊函數計算後得到所屬欄位,
* 接著依序走訪該欄位並檢查目標資料 @key 是否存在於該欄位中,
* 若有找到則回傳包含該資料的節點,若不存在則回傳 NULL。
*
* @param map 目標雜湊表
* @param key 目標節點的 key
* @return struct hash_key*
*/
static struct hash_key *find_key(map_t *map, int key)
{
if(!map)
return NULL;
struct hlist_head *head = &(map->ht)[hash(key, map->bits)];
for(struct hlist_node *p = head->first; p; p = p->next){
struct hash_key *kn = container_of(p, struct hash_key, node);
if(kn->key == key)
return kn;
}
return NULL;
}
```
#### 查找雜湊表以取得節點資料
```c=
/**
* @brief 從雜湊表中提取 key 與 @key 相符之節點 kn 的資料 data
*
* @param map 目標雜湊表
* @param key 數列 nums 中的某數 k
* @return void*
*/
void *map_get(map_t *map, int key)
{
struct hash_key *kn = find_key(map, key);
return kn ? kn->data : NULL;
}
```
#### 資料存入雜湊表
```c=
/**
* @brief 建立一節點 kn 來保存 @key 以及 @data
*
* @param map 目標雜湊表
* @param key 數列 nums 中的某數 k
* @param data k 在數列 nums 中的 index
*/
void map_add(map_t *map, int key, void *data)
{
struct hash_key *kn = find_key(map, key);
if (kn)
return; // 所有節點的 key 不應重複
// 分配空間以及將資料複製到該節點
kn = malloc(sizeof(struct hash_key));
kn->key = key, kn->data = data;
/**
* @brief 將新增的節點插入到對應欄位的首位
*
* 首先透過雜湊函式取得目標欄位的地址 h
* 以及該欄位的首位節點 first
* 最後再為 h->next, n, first 重新建立連結
*
* - 新的節點 n 需要設定 pprev 以及 next
* - n->next 指向原本的首位節點 first
* - n->pprev 則指向欄位 h 指向首位節點的結構體地址 &h->first
*
* - 欄位 h 應指向新的首位節點 n
* - 原首位節點 first 的 pprev 改指向新節點 n 的 next 的地址 &n->next
*
*/
struct hlist_head *h = &map->ht[hash(key, map->bits)];
struct hlist_node *n = &kn->node, *first = h->first;
n->next = first;
if (first)
first->pprev = &n->next;
h->first = n;
n->pprev = &h->first;
}
```
```graphviz
digraph {
node[shape=record]
splines = false
rankdir = "LR"
subgraph cluster01 {
label="map_t"
bits [label="bits"]
subgraph bucket{
label = "hlist_head *ht"
ht_0 [label = "hlist_head[0] | <f>*first"]
ht_1 [label = "hlist_head[1] | <f>*first"]
ht_n [label = "hlist_head[...] | <f>*first"]
ht_bit [label = "hlist_head[ 2^bits - 1 ] | <f>*first"]
}
}
node_0 [ label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
node_01 [ label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
// node_1 [ label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
node_n [ label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
node_bit [ label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
ht_0:f -> node_0:m
// ht_1:f -> node_1:m
ht_1:f -> NULL_1
ht_n:f -> node_n:m
ht_bit:f -> node_bit:m
node_0:p -> ht_0:f
node_01:p -> node_0:n
// node_1:p -> ht_1:f
node_n:p -> ht_n:f
node_bit:p -> ht_bit:f
NULL_0[shape = plaintext, label = "NULL"]
NULL_1[shape = plaintext, label = "NULL"]
NULL_3[shape = plaintext, label = "NULL"]
dot [shape = plaintext, label = "..."]
node_0:n -> node_01:m
node_01:n -> NULL_0
// node_1:n -> NULL_1
node_n:n -> dot
node_bit:n -> NULL_3
ht_0 -> node_0 -> node_01 [
weight = 10, style = invis
]
}
```
> 指標操示意圖,我們觀察若是節存入欄位 `hlist_head[1]` 的流程
```graphviz
digraph {
rankdir = "LR"
node [shape = record]
NULL[shape = plaintext, label = "NULL"]
ht_1 [label = "<h>hlist_head[1] | <f>*first"]
node_0 [ label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
ht_1:f -> NULL
h [label = "*h", fillcolor = "Turquoise", style = filled]
n [label = "*n", fillcolor = "Turquoise", style = filled]
first [label = "*first", fillcolor = "Turquoise", style = filled]
h -> ht_1:h
first -> NULL
n -> node_0:m
ht_1:h -> n [
style = invis
]
NULL -> n [
style = invis
]
}
```
```c=
n->next = first;
```
```graphviz
digraph {
rankdir = "LR"
node [shape = record]
NULL[shape = plaintext, label = "NULL"]
ht_1 [label = "<h>hlist_head[1] | <f>*first"]
node_0 [label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
ht_1:f -> NULL
// NULL -> node_0:p [
// style = invis
// ]
h [label = "*h", fillcolor = "Turquoise", style = filled]
n [label = "*n", fillcolor = "Turquoise", style = filled]
first [label = "*first", fillcolor = "Turquoise", style = filled]
h -> ht_1:h
first -> NULL
n -> node_0:m
node_0:n -> NULL
}
```
```c=
if (first) // false, do nothing
first->pprev = &n->next; // skip
h->first = n;
```
```graphviz
digraph {
rankdir = "LR"
node [shape = record]
NULL[shape = plaintext, label = "NULL"]
ht_1 [label = "<h>hlist_head[1] | <f>*first"]
node_0 [label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
h [label = "*h", fillcolor = "Turquoise", style = filled]
n [label = "*n", fillcolor = "Turquoise", style = filled]
first [label = "*first", fillcolor = "Turquoise", style = filled]
h -> ht_1:h
first -> NULL
n -> node_0:m
node_0:n -> NULL
ht_1:f -> node_0:m
}
```
```c=
n->pprev = &h->first;
```
```graphviz
digraph {
rankdir = "LR"
splines = false
node [shape = record]
ht_1 [label = "<h>hlist_head[1] | <f>*first"]
node_0 [label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
NULL[shape = plaintext, label = "NULL"]
h [label = "*h", fillcolor = "Turquoise", style = filled]
n [label = "*n", fillcolor = "Turquoise", style = filled]
first [label = "*first", fillcolor = "Turquoise", style = filled]
h -> ht_1:h
first -> NULL
n -> node_0:m
ht_1:f -> node_0:m
node_0:n -> NULL
node_0:p -> ht_1:f
ht_1 -> node_0 [
weight = 10, style = invis
]
}
```
> 如果 `*first` 有指向某節點,像是 `hlist_head[0]` 的狀況,則原本是 `NULL` 的地方為該節點,並且判別式變為真,執行以下程式碼:
```c=
if (first) // true
first->pprev = &n->next; // red arrow the diagram below
```
```graphviz
digraph {
rankdir = "LR"
splines = false
node [shape = record]
ht_0 [label = "<h>hlist_head[0] | <f>*first"]
node_0 [label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
node_1 [label = "<m>hlist_node | {<p>**pprev | <n>*next}"]
NULL[shape = plaintext, label = "..."]
h [label = "*h", fillcolor = "Turquoise", style = filled]
n [label = "*n", fillcolor = "Turquoise", style = filled]
first [label = "*first", fillcolor = "Turquoise", style = filled]
h -> ht_0:h
first -> node_1:m
n -> node_0:m
ht_0:f -> node_0:m
node_0:n -> node_1:m
node_1:n -> NULL
node_1:p -> node_0:n [color = red]
node_0:p -> ht_0:f
ht_0 -> node_0 -> node_1 [
weight = 10, style = invis
]
}
```
---
:::warning
雖然理解了指標的指標 `**pprev` 如何使用,但是我仍不理解為什麼需要定義它在雜湊表之中,直接捨棄不用不是更節省空間嗎?因為不論是查找雜奏表或者加入新的節點,似乎都不需要往回訪問前面的節點,想到的可能是將節點插入特定位置,如某一個節點之前,或對每一個欄位進行排序,才需要 `**pprev` 來達到快速交換節點的效果,但是這對於雜湊表來說似乎不是必要的,與其對於每一個欄位進行排序,以提高 Collision 時節點的查找效率,不如直接擴充欄位數量,或者改變 hash function 使其對於資料輸出更趨近隨機的分類,根本性的減少 Collision 發生的機率。
:::
#### 清除雜湊表
```c=
void map_deinit(map_t *map)
{
if (!map)
return;
for (int i = 0; i < MAP_HASH_SIZE(map->bits); ++i) {
struct hlist_head *head = &map->ht[i];
for (struct hlist_node *p = head->first; p; ) {
struct hash_key *kn = container_of(p, struct hash_key, node);
struct hlist_node *n = p;
p = p->next;
// if (!n->pprev)
// goto bail;
// struct hlist_node *next = n->next;
// struct hlist_node **pprev = n->pprev;
// if (next)
// next->pprev = pprev;
// n->next = NULL;
// n->pprev =NULL;
//bail:
free(kn->data);
free(kn);
}
}
free(map->ht);
free(map);
}
```
#### 測試使用雜湊表解決 Two Sum 問題
```c=
int main()
{
int nums[] = {1,4,5,10,23,87,96,111,45,11};
int target = 12;
int *res = malloc(sizeof(int) * 2);
map_t *map = map_init(2); // bocket size 2^2 = 4
for(int i = 0; i< 10; ++i){
int *p = map_get(map, target - nums[i]);
if(p) {
res[0] = *p;
res[1] = i;
break;
} else {
p = malloc(sizeof(int));
*p = i;
map_add(map, nums[i], p);
}
}
printf("{%d, %d} \n", res[0], res[1]);
free(res);
map_deinit(map);
return 0;
}
```
```shell=
$ gcc -o out Hashmap.c
$ ./out
{0, 9}
// 以下顯示雜湊表內部
0 -> [ 8 5 ]
1 -> [ 7 0 ]
2 -> [ 6 1 ]
3 -> [ 4 3 2 ]
```
### 研讀 Linux 核心原始程式碼在 [Hash Table](https://github.com/torvalds/linux/blob/master/include/linux/hashtable.h) 上的實作
:::info
在以前,我查找程式碼的方式只能依賴 `ctrl + F` 或編輯器提供的`find all refferece` 。這次學習到有關查找特定檔案以及程式碼的方式(雖然很基本,但我現在才會):
- 在本機 (linux 作業系統) 使用指令 locate
```shell
$ locate hashtable.h
/usr/src/linux-hwe-5.13-headers-5.13.0-40/include/linux/hashtable.h
/usr/src/linux-hwe-5.13-headers-5.13.0-40/include/linux/rhashtable.h
/usr/src/linux-hwe-5.13-headers-5.13.0-51/include/linux/hashtable.h
/usr/src/linux-hwe-5.13-headers-5.13.0-51/include/linux/rhashtable.h
```
- 直接上 GitHub 左上角搜尋欄 [in this repository 選項]
:::
#### `hlist_head` 以及 `hlist_node`
定義在 `linux/types.h`
```c=
// linux/types.h
struct hlist_head {
struct hlist_node *first;
};
struct hlist_node {
struct hlist_node *next, **pprev;
};
```
宣告為一個大小為 `2^bits` 的 `hlist_head` 陣列:
```c=
// linux/hashtable.h
#define DECLARE_HASHTABLE(name, bits) \
struct hlist_head name[1 << (bits)]
static inline void __hash_init(struct hlist_head *ht, unsigned int sz)
{
unsigned int i;
for (i = 0; i < sz; i++)
INIT_HLIST_HEAD(&ht[i]); //(&ht[i])->first = NULL
}
/**
* hash_init - initialize a hash table
* @hashtable: hashtable to be initialized
*
* Calculates the size of the hashtable from the given parameter, otherwise
* same as hash_init_size.
*
* This has to be a macro since HASH_BITS() will not work on pointers since
* it calculates the size during preprocessing.
*/
#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable))
```
結構與前面自定義的雜湊表大致相同,除了自定義的雜湊表有使用 `map_t` 將 `bits` 資訊包含其中
:::info
$Q$: 為什麼欄位的大小要定義為 $2^{bits}$ ?
[**-參考文章:HashMap 初始容量為何是2的n次方**](https://www.gushiciku.cn/pl/a1Pd/zh-tw)
一般將新的節點分配給雜湊表的欄位時,我們通常在 hash function 的最後,會將數值 `X` 取欄位個數的餘數作為最終的輸出,理想的狀態下,最終輸出數值的範圍便會均勻的分佈在 $[0, lengh-1]$ 間。當這個欄位數量 `lengh` 為 $2^{bits}$ 時,以下等式成立:
   $X \mod lengh \equiv X \wedge (lengh - 1)$
使用二進制實際推演會發現 $2^n$ 只在第 `n+1` 位元數為 `1` ,也就是說 $2^n$ 會使包含 `n` 位以下的位元為 `1` ,例如:
   $0b10000 - 1 = 0b01111$
此情況下取餘數會等價於,將轉成二進制的 `X` 高於 `n` 位的位數去除。相較於餘數運算, bit wise 的運算更有效率,可以更快找到新加入節點所屬的欄位。
:::
#### Hash Function
在核心中主要的 Hash Function 定義叫做 `hash_main(val, bits)`,從所需的參數可以知道欄位的數量也是因子之一。
```c=
// linux/hashtable.h
/* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */
#define hash_min(val, bits) \
(sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) // hash function contains gloden ratio
```
```c=
// linux/hash.h
#define GOLDEN_RATIO_32 0x61C88647
...
/*
* The _generic versions exist only so lib/test_hash.c can compare
* the arch-optimized versions with the generic.
*
* Note that if you change these, any <asm/hash.h> that aren't updated
* to match need to have their HAVE_ARCH_* define values updated so the
* self-test will not false-positive.
*/
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
static inline u32 __hash_32_generic(u32 val)
{
return val * GOLDEN_RATIO_32;
}
#ifndef HAVE_ARCH_HASH_32
#define hash_32 hash_32_generic
#endif
static inline u32 hash_32_generic(u32 val, unsigned int bits)
{
/* High bits are more random, so use them. */
return __hash_32(val) >> (32 - bits);
}
```
結合上面探討的問題 **為什麼欄位的大小要定義為 $2^{bits}$ ?** 可以發現核心直接使用效率高的右移運算 `>>` 實現將數值分配在定義大小為 $2^{bits}$ 的範圍之內,稍有不同的是核心的寫法是選擇留下高位元的數值,理由是高位元會更加隨機。
#### 其他實作
- 加入新的節點至雜湊表中的方法有除了上面自己實作的 `hlist_add_head` ,還有其他變體如 `hlist_add_before`, `hlist_add_behind`, `hlist_add_before` 等,這邊稍微觀察 `hlist_add_head` 與上面自己實作的 `hlist_add` 差異:
```c=
// linux/list.h
/**
* hlist_add_head - add a new entry at the beginning of the hlist
* @n: new entry to be added
* @h: hlist head to add it after
*
* Insert a new entry after the specified head.
* This is good for implementing stacks.
*/
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
struct hlist_node *first = h->first;
WRITE_ONCE(n->next, first);
if (first)
WRITE_ONCE(first->pprev, &n->next);
WRITE_ONCE(h->first, n);
WRITE_ONCE(n->pprev, &h->first);
}
```
結構上一致,但我好奇 `WRITE_ONCE` 是什麼,發現它定義於 `linux/compiler.h` 中,主要是確保在多執行序的情形下不會讀取初錯誤的數值,與其相對的也有定義 `READ_ONCE`
[解釋來源](https://stackoverflow.com/questions/34988277/write-once-in-linux-kernel-lists)
```c=
// linux/compiler.h
#define WRITE_ONCE(var, val) \
(*((volatile typeof(val) *)(&(var))) = (val))
```
- 在刪除節點的實作上有 `hlist_del` 與 `hlist_del_init` ,前者有點像是 `remove` 只是被移出雜湊表,但被移除的節點指標會指向一段被定義好的的位置而不是指向 `NULL` ,如果要重新利用這個節點就使用`hlist_del_init` , 這會 **`unhash`** 節點並且讓其 `next`, `pprev` 都指向 `NULL`
:::warning
**`unhash`** 的定義?
觀察以下程式碼,似乎只要節點的 `pprev` 指向 `NULL` 就算是 `unhash`,而 `next == NULL` 有可能是表示他是串列中的最後一個節點。這算是其中一個 `pprev` 需要定義的理由嗎?
```c=
// linux/hashtable.h
/**
* hash_hashed - check whether an object is in any hashtable
* @node: the &struct hlist_node of the object to be checked
*/
static inline bool hash_hashed(struct hlist_node *node)
{
return !hlist_unhashed(node); // === return node->pprev
}
```
:::
- `for_each` 系列
- `hash_for_each`
- `hlist_for_each_entry`
- `hash_for_each_safe`
- `hlist_for_each_entry_safe`
族繁不及備載,基本上跟在 `list_for_each` 系列看到的用法差不多。
:::warning
RCU 的意思
:::
### 閱讀文件 [ How does the kernel implements Hashtables?](https://kernelnewbies.org/FAQ/Hashtables)
## 測驗二: [Remove Duplicates from Sorted List II](https://leetcode.com/problems/remove-duplicates-from-sorted-list-ii/)
**以下實作都先不考慮記憶體釋放 (free)**
### Linux 風格
使用雙重指標處理 `head` 變動的情況
```c=
/**
* Definition for singly-linked list.
* struct ListNode {
* int val;
* struct ListNode *next;
* };
*/
struct ListNode *deleteDuplicates(struct ListNode *head)
{
if (!head || !head->next)
return head;
struct ListNode **pptr = &head;
struct ListNode *current = head;
while (current) {
if (current->next && current->next->val == current->val) {
current = current->next;
while (current->next && current->next->val == current->val) // consider more than two dup_
current = current->next;
current = current->next; // must point to distinct node
/* head = current (when first time call)
* head->next = current (when second call)
*/
*pptr = current;
} else {
pptr = &((*pptr)->next); // first time calls, pptr = &(head->next)
current = current->next;
}
}
return head;
}
```
### 以類似 Linux 核心的 circular doubly-linked list 改寫,撰寫遞迴和迭代 (iterative) 的程式碼
```c=
/**
* define as a circular doubly-linked list
* struct ListNode{
* int val;
* struct ListNode *pre, *next;
* };
*/
struct ListNode *deleteDuplicates(struct ListNode *head)
{
if (!head || head->next == head->pre)
return head;
struct ListNode **pptr = &head;
struct ListNode *current = head;
do {
if (current->next->val == current->val) {
current = current->next;
while (current->next != head && current->next->val == current->val)
current = current->next;
current = current->next;
if(current == *pptr) // if whole nodes are the same
return NULL;
*pptr = current;
} else {
pptr = &((*pptr)->next);
current = current->next;
}
} while (current->next != head);
return head;
}
```