contributed by < millaker
>
$gcc --version
gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
$lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 43 bits physical, 48 bits virtual
CPU(s): 16
On-line CPU(s) list: 0-15
Thread(s) per core: 2
Core(s) per socket: 8
Socket(s): 1
NUMA node(s): 1
Vendor ID: AuthenticAMD
CPU family: 23
Model: 113
Model name: AMD Ryzen 7 3700X 8-Core Processor
Stepping: 0
Frequency boost: enabled
CPU MHz: 1862.963
CPU max MHz: 3600.0000
CPU min MHz: 2200.0000
BogoMIPS: 7186.65
Virtualization: AMD-V
L1d cache: 256 KiB
L1i cache: 256 KiB
L2 cache: 4 MiB
L3 cache: 32 MiB
NUMA node0 CPU(s): 0-15
queue.h
和 list.h
了解 queue node 是如何定義注意程式碼排版風格,以 4 個空白字元進行縮排。大括號 ({
) 的位置要留意
jserv
struct list_head {
struct list_head *next;
struct list_head *prev;
}
struct element_t {
char *value;
struct list_head list;
}
佇列的每個 element 用 list.h
內提供的 struct list head
來連接, struct list head
內含兩個指標分別指向前一個和下一個節點, 如果是空的 list,則 next
和 prev
皆指向目前的 list_head
。
queue.c
程式碼實作struct list_head *q_new()
{
struct list_head *new = malloc(sizeof(struct list_head));
if (!new)
return NULL;
return new->next = new->prev = new;
}
q_new
:Linux Kernel API 有提供 INIT_LIST_HEAD
,較好閱讀。
struct list_head *q_new()
{
struct list_head *new = malloc(sizeof(struct list_head));
if (!new)
return NULL;
INIT_LIST_HEAD(new);
return new;
}
因為 list_head
包含在佇列的 element_t
內, 利用 list_head
走訪每個 element_t
時, 必須得到包含此 list_head
的 element_t
的地址,所以要用到 你所不知道的 C 語言: linked list 和非連續記憶體 內講到的 container_of
巨集。
第一次嘗試的作法如下:
void q_free(struct list_head *l)
{
struct list_head **curr = &l->next;
element_t *curr_ele;
while (*curr != l) {
curr_ele = container_of(*curr, element_t, list);
free(curr_ele->value);
*curr = (*curr)->next;
free(curr_ele);
}
free(l);
}
在執行 make check
時,在程式結尾會出現以下錯誤:
cmd> quit
Freeing queue
Segmentation fault occurred. You dereferenced a NULL or invalid pointer
從 qtest.c
追溯問題,得知 quit
命令會再呼叫一次 q_free
, 如果在執行 quit
前就已經釋放所有佇列節點,傳進去的 list_head
會是 NULL
,於是第一行 &l->next
便會產生 deferencing a NULL pointer 的問題。經過修改,新增一個檢查和把 container_of
換成更容易看懂的 list_entry
。
void q_free(struct list_head *l)
{
if (!l)
return;
struct list_head **curr = &l->next;
element_t *curr_ele;
while (*curr != l) {
curr_ele = list_entry(*curr, element_t, list);
free(curr_ele->value);
*curr = (*curr)->next;
free(curr_ele);
}
free(l);
}
q_insert_head
和 q_insert_tail
這兩個函式的實作方法很相近,所以放在一起,依序檢查 head
、temp
、 val
在傳入或 malloc
過後是否為 NULL
,就不用再多使用free()
來釋放前面已配置的記憶體。malloc
都正確執行後,雖然上課中有講到 strcpy() 和一些常用函式的危險性,但在這邊,分配給val
的記憶體大小都是確定的,不會產生字串大小超過 buffer size
的疑慮。最後再使用Linux Kernel API 所提供的 list_add()
和 list_add_tail
將新的 list_head
插入。
bool q_insert_head(struct list_head *head, char *s)
{
if (!head)
return false;
element_t* temp = malloc(sizeof(element_t));
if (!temp)
return false;
char* val = malloc(sizeof(char) * strlen(s) + 1);
if (!val)
return false;
strcpy(val, s);
temp->value = val;
list_add(&temp->list, head);
return true;
}
bool q_insert_tail(struct list_head *head, char *s)
{
if (!head)
return false;
element_t* temp = malloc(sizeof(element_t));
if (!temp)
return false;
char* val = malloc(sizeof(char) * strlen(s) + 1);
if (!val)
return false;
strcpy(val, s);
temp->value = val;
list_add_tail(&temp->list, head);
return true;
}
詳細閱讀 Linux Kernel API 提供的 list_head
操作巨集及函式,發現有許多好用的功能,
以下實做都會盡量使用,前面可更改的部份後續處理。
這裡用到幾個 list.h
提供的API,list_last_entry
, list_first_entry
, list_del_init
。
可以在 list.h
內觀察到,其實list_last_entry
和 list_first_entry
就是重新包裝過後的 container_of
。
element_t *q_remove_head(struct list_head *head, char *sp, size_t bufsize)
{
if (!head || list_empty(head))
return NULL;
element_t *remove = list_first_entry(head, element_t, list);
if (sp) {
strncpy(sp, remove->value, bufsize);
sp[bufsize - 1] = '\0';
}
list_del_init(&remove->list);
return remove;
}
element_t *q_remove_tail(struct list_head *head, char *sp, size_t bufsize)
{
if (!head || list_empty(head))
return NULL;
element_t *remove = list_last_entry(head, element_t, list);
if (sp) {
strncpy(sp, remove->value, bufsize);
sp[bufsize - 1] = '\0';
}
list_del_init(&remove->list);
return remove;
}
測試將 NULL
傳入 q_remove_head
和 q_remove_tail
時會發生錯誤,發現在開頭檢查少一條件,加上 list_empty(head)
更新: 沒有考慮到移除的字串大小大於 buffer 的情況,需要在結尾補上 '\0'。
和 lab0-牛刀小試 一樣
int q_size(struct list_head *head)
{
if (!head)
return 0;
int len = 0;
struct list_head *li;
list_for_each(li, head)
len++;
return len;
}
Linux Kernel API 中是有提供 list_swap
一函式,但是本次作業中 list.h
並沒有此函式。
void q_swap(struct list_head *head)
{
// https://leetcode.com/problems/swap-nodes-in-pairs/
if (!head)
return;
struct list_head *first = head->next, *second = head->next->next;
while (first != head && second != head && first != second) {
first->prev->next = second;
first->next = second->next;
second->next->prev = first;
second->prev = first->prev;
first->prev = second;
second->next = first;
first = first->next;
second = second->next->next->next;
}
}
因為 list_swap()
有在不只一個地方使用,於是寫成 helper function並取代掉原來的部份,讓程式碼更精簡。
void q_swap(struct list_head *head)
{
// https://leetcode.com/problems/swap-nodes-in-pairs/
if (!head)
return;
struct list_head *first = head->next, *second = head->next->next;
while (first != head && second != head && first != second) {
list_swap(first, second);
first = first->next;
second = second->next->next->next;
}
}
使用兩個 list_head
指標 front, back
分別指向第一個和最後一個 list_head
,每個步驟中分別將 front
往後指一單位 back
往前指一單位。
bool q_delete_mid(struct list_head *head)
{
// https://leetcode.com/problems/delete-the-middle-node-of-a-linked-list/
if (!head || list_empty(head))
return false;
struct list_head *front = head->next, *back = head->prev;
while (front->prev != back && front != back) {
front = front->next;
back = back->prev;
}
list_del(front);
element_t *ele = list_entry(front, element_t, list);
free(ele->value);
free(ele);
return true;
}
發現有其他地方也會需要找到中間節點,所以把它寫成函式 q_find_mid
struct list_head *q_find_mid(struct list_head *head)
{
if(!head || list_empty(head))
return NULL;
struct list_head *front = head->next, *back = head->prev;
while (front->prev != back && front != back) {
front = front->next;
back = back->prev;
}
return front;
}
不過在寫mergesort時,因為將佇列轉成 NULL-teriminated singly-linked list,函式裡的 prev
都會失效,於是參考上課講義內使用兩快慢指標來找到中間節點。
使用兩個指標 curr
和 it
, curr
指向當前要比較的值,由 it
往下將值相同的節點都刪除並加入由 removed
為 head 的一條新佇列,最後在呼叫前面實作的 q_free
將所有重複的節點與 removed
一同釋放。
bool q_delete_dup(struct list_head *head)
{
// https://leetcode.com/problems/remove-duplicates-from-sorted-list-ii/
if (!head || list_empty(head))
return false;
struct list_head *curr = head->next;
struct list_head *it = curr->next;
struct list_head *removed = malloc(sizeof(struct list_head));
INIT_LIST_HEAD(removed);
while (curr != head && it != head) {
element_t *ele_target = list_entry(curr, element_t, list);
element_t *ele_curr = list_entry(it, element_t, list);
if (it != head && !strcmp(ele_target->value, ele_curr->value)) {
while (it != head && !strcmp(ele_target->value, ele_curr->value)) {
ele_curr = list_entry(it->next, element_t, list);
struct list_head *temp = it->next;
list_del(it);
list_add(it, removed);
it = temp;
}
struct list_head *temp = curr->next;
list_del(curr);
list_add(curr, removed);
curr = temp;
it = curr->next;
} else {
curr = curr->next;
it = curr->next;
}
}
q_free(removed);
return true;
}
將每個節點依序放到佇列的尾巴
void q_reverse(struct list_head *head)
{
if(!head || list_empty(head))
return;
struct list_head *node, *safe;
list_for_each_safe(node, safe, head){
list_move(node, head);
}
}
最初的想法是將鏈結串列拆成兩個有 head
的串列,再進行 mergesort,但是在 q_sort 內 malloc
是不允許的。在無法給拆解的兩條串列 head
的情況下,要操作 doubly-linked list 實在非常困難,想了很久實在想不到方法,所以先閱讀 linux/list_sort.c
才知道先將 doubly-linked list
轉成 singly-linked list
。
void q_sort(struct list_head *head)
{
if (!head || list_empty(head))
return;
head->prev->next = NULL;
head->next = merge_sort_list(head->next);
struct list_head *prev = NULL, *curr = head;
while (curr) {
curr->prev = prev;
prev = curr;
curr = curr->next;
}
head->prev = prev;
prev->next = head;
}
struct list_head *merge_sort_list(struct list_head *head)
{
if (!head || !head->next)
return head;
struct list_head *slow = head, *mid;
for (struct list_head *fast = head->next; fast && fast->next;
fast = fast->next->next)
slow = slow->next;
mid = slow->next;
slow->next = NULL;
struct list_head *left = merge_sort_list(head),
*right = merge_sort_list(mid);
return merge(left, right);
}
struct list_head *merge(struct list_head *l1, struct list_head *l2)
{
struct list_head *head = NULL, **ptr = &head, **node;
for (node = NULL; l1 && l2; *node = (*node)->next) {
node = (strcmp(list_entry(l1, element_t, list)->value,
list_entry(l2, element_t, list)->value) < 0)
? &l1
: &l2;
*ptr = *node;
ptr = &(*ptr)->next;
}
if (l1)
*ptr = l1;
else
*ptr = l2;
return head;
}
其中 merge_sort_list
是參考課程講義內 你所部知道的 C 語言:linked list 和非連續記憶體 - Merge Sort 實作 快慢指標找中間節點的方法,這裡無法使用我自己寫的 q_find_mid
是因為已經不是 doubly-circular-linked list,無法直接存取 head->prev
。merge
函式則是使用同一篇講義內 Merge two sorted list 使用 indirect pointer的方法。關於 linux/list_sort.c
內使用的方法目前還在研讀。
以上就是原有功能的實作,在執行 make test
時, trace 17 time complexity test 無法在本機執行通過,但在 github action 上卻過了,觀察其他同學 github action 的動態,相近寫法的程式碼有些通過有些沒通過。
linux/list_sort.c 檔案內有這段註解在解釋使用了什麼特殊的改變,讓 merge sort 能夠減少呼叫 cmp()
的次數,也就是比較節點大小的次數,以下挑選出方法解釋的部份。
The merging is controlled by "count", the number of elements in the
pending lists. This is beautifully simple code, but rather subtle.
...
...
There are six states we distinguish. "x" represents some arbitrary
bits, and "y" represents some arbitrary non-zero bits:
0: 00x: 0 pending of size 2^k; x pending of sizes < 2^k
1: 01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
2: x10x: 0 pending of size 2^k; 2^k + x pending of sizes < 2^k
3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
4: y00x: 1 pending of size 2^k; 2^k + x pending of sizes < 2^k
5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
(merge and loop back to state 2)
We gain lists of size 2^k in the 2->3 and 4->5 transitions (because
bit k-1 is set while the more significant bits are non-zero) and
merge them away in the 5->2 transition. Note in particular that just
before the 5->2 transition, all lower-order bits are 11 (state 3),
so there is one list of each smaller size.
這裡使用一個 int count
來追蹤有多少已排序好的串列在 pending lists 中,如果有兩條長度為
程式碼實作的部份:
head->prev->next = NULL;
先將佇列改成 singly-linked list
,指標 prev 另作它用。
do {
size_t bits;
struct list_head **tail = &pending;
/* Find the least-significant clear bit in count */
for (bits = count; bits & 1; bits >>= 1)
tail = &(*tail)->prev;
/* Do the indicated merge */
if (likely(bits)) {
struct list_head *a = *tail, *b = a->prev;
a = merge(priv, cmp, b, a);
/* Install the merged result in place of the inputs */
a->prev = b->prev;
*tail = a;
}
/* Move one element from input list to pending */
list->prev = pending;
pending = list;
list = list->next;
pending->next = NULL;
count++;
} while (list);
這段程式碼分三個部份,
count
找到需要被 merge 的串列以上迴圈重複執行直到 list 內全部的節點都輸入至 pending 。迴圈執行完後,剩下就是把尚未排序的 pending list 全部排序完成,呼叫 merge_final()
。
static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head,
struct list_head *a, struct list_head *b)
{
struct list_head *tail = head;
u8 count = 0;
for (;;) {
/* if equal, take 'a' -- important for sort stability */
if (cmp(priv, a, b) <= 0) {
tail->next = a;
a->prev = tail;
tail = a;
a = a->next;
if (!a)
break;
} else {
tail->next = b;
b->prev = tail;
tail = b;
b = b->next;
if (!b) {
b = a;
break;
}
}
}
/* Finish linking remainder of list b on to tail */
tail->next = b;
do {
/*
* If the merge is highly unbalanced (e.g. the input is
* already sorted), this loop may run many iterations.
* Continue callbacks to the client even though no
* element comparison is needed, so the client's cmp()
* routine can invoke cond_resched() periodically.
*/
if (unlikely(!++count))
cmp(priv, b, b);
b->prev = tail;
tail = b;
b = b->next;
} while (b);
/* And the final links to make a circular doubly-linked list */
tail->next = head;
head->prev = tail;
}
merge_final
會也會一併把 list 恢復成 doubly-circular-linked list
。
以後不要花太多時間 Google 搜尋,回到第一手材料。
jserv
linux/list_sort.c 內的註解沒有提到太多為什麼可以加速,在 google 上也找不到類似的實作方法,最後是去翻這個檔案的 commit history,發現原先 list_sort 並不是這樣實作的,在三年前才更新成現有版本,其中 commit message 寫的非常清楚:
linux/list_sort.c
commit b5c56e0
:盡可能的將 list 的前端排序好,只要有兩個相同的長度
的 pending list ,就合併他>們,這樣只要 的大小能放進 L1 cache, 就不會有 cache miss 的問題,只有最後
幾次執行的 merge 因為 list 總長度而強迫出現 cache miss。
在 merge()
和 merge_final()
內看到兩個疑似函式,likely()
和 unlikely()
,經查詢後得知並不是函式,而是 linux 定義的巨集,定義如下:
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
其中 __builtin_expect()
是 GNU 的內建函式 (builtins), 讓編譯器知道說會預期得到什麼值,likely()
和 unlikely()
就是讓編譯器知道這邊組合語言要怎麼編排才比較不會讓分支預測失敗,減少 pipeline flush 的次數。 參考
應該先查閱 gcc 手冊,而非看 stackoverflow 的討論,畢竟你現在不具備完整的判斷能力。
jserv
了解,我會改掉這習慣。
為了將list_sort.c 引入我自己的 qtest
中,把一些 Linux kernel 才有的東西移除,像是
likely()
, unlikely()
, priv
,並自己寫一個 cmp()
比較函式。再用 ADD_COMMAND
的方法在 qtest
直接新增一個新的命令 lsort
。
使用 qtest
內提供的 time
來紀錄時間,測試用 it RAND 1000000
,先排序好, reverse 當作測試資料,所以兩個方法都會是排序一樣的資料。
以下有省略一些不需要的資訊
cmd> new
l = []
cmd> it RAND 1000000
l = [fsdzyjqxr kdqfavlq tnidjsi ezzprwg ihknkh ixhniavja xesvkqbey ynxnxwxwd hqhghid cdsltfit ubhrht qwhxemkhn dhbmdwrn kcqiq tpjcc ynbmq crktlkhn scfkib catddyxpt lbaef hxknhzlr aqhyekaks edymdkf lrwzklgkd yktqvuypf xqyvqxij hogmeswdf lvprck qqvmzlulq ojbqtamd ... ]
cmd> reverse
l = [zzzyxa zzzyumacd zzzynxon zzzylnjdo zzzwxauz zzzwtch zzzwsbo zzzwemby zzzvpbpxg zzzvf zzzutw zzzueqt zzzuccsef zzztwsmkq zzztaoq zzzsbiu zzzqpa zzzqo zzzpxxszc zzzpumxb zzzprqh zzzpm zzzpiz zzzopor zzzooqrob zzzoommd zzznph zzznjwwo zzzngxyp zzzndaqao ... ]
cmd> time sort
l = [aaaaozwb aaaazkl aaabeasj aaabhsp aaabvtlpm aaabyt aaacmf aaacueeg aaaftgjcq aaafwnrw aaagho aaagiad aaagkd aaagurvs aaagvcvl aaaiflyz aaail aaairrd aaaivfy aaakbo aaakhvvua aaakjpgm aaakzzy aaammbeto aaamo aaaoadwf aaaoglfc aaapu aaapxnka aaaqceq ... ]
Delta time = 1.421
cmd> reverse
l = [zzzyxa zzzyumacd zzzynxon zzzylnjdo zzzwxauz zzzwtch zzzwsbo zzzwemby zzzvpbpxg zzzvf zzzutw zzzueqt zzzuccsef zzztwsmkq zzztaoq zzzsbiu zzzqpa zzzqo zzzpxxszc zzzpumxb zzzprqh zzzpm zzzpiz zzzopor zzzooqrob zzzoommd zzznph zzznjwwo zzzngxyp zzzndaqao ... ]
cmd> time lsort
l = [aaaaozwb aaaazkl aaabeasj aaabhsp aaabvtlpm aaabyt aaacmf aaacueeg aaaftgjcq aaafwnrw aaagho aaagiad aaagkd aaagurvs aaagvcvl aaaiflyz aaail aaairrd aaaivfy aaakbo aaakhvvua aaakjpgm aaakzzy aaammbeto aaamo aaaoadwf aaaoglfc aaapu aaapxnka aaaqceq ... ]
Delta time = 0.907
可以看出 lsort
比我自己寫的快約 37% 。
PERF:
使用 Wikipedia - Fisher–Yates shuffle 內 modern algorithm
,每次隨機選取小於 size - i 的節點, 其中 i 為執行次數,和最後一個節點交換
void q_shuffle(struct list_head *head)
{
struct list_head *list = head, *tail = head->prev;
int size = q_size(head);
srand(time(0));
while (size > 0) {
int num = rand() % size;
struct list_head *target;
for (target = list; num >= 0; target = target->next)
num--;
if (target != tail)
list_swap(target, tail);
tail = tail->prev;
size--;
}
}
這裡使用的 list_swap
是參考 Linux Kernel API 實作的
void list_swap(struct list_head *a, struct list_head *b)
{
struct list_head *pos = b;
list_del(b);
//Replace a with b
b->next = a->next;
b->next->prev = b;
b->prev = a->prev;
b->prev->next = b;
if(pos == a)
pos = b;
list_add(a, pos);
}
在 qtest.c
init_console()
內新增
ADD_COMMAND(shuffle," | Shuffle the queue in random order");
便完成 q_shuffle
的功能擴充
cmd> show
l = [1 2 3 4 5 6]
cmd> shuffle
l = [1 6 2 3 4 5]
cmd> shuffle
l = [1 6 2 5 3 4]