contributed by < cwl0429 >
linux2022
$ gcc --version
gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
$ lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 39 bits physical, 48 bits virtual
CPU(s): 8
On-line CPU(s) list: 0-7
Thread(s) per core: 2
Core(s) per socket: 4
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 142
Model name: Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz
Stepping: 11
CPU MHz: 1800.000
CPU max MHz: 3900.0000
CPU min MHz: 400.0000
BogoMIPS: 3600.00
Virtualization: VT-x
L1d cache: 128 KiB
L1i cache: 128 KiB
L2 cache: 1 MiB
L3 cache: 6 MiB
NUMA node0 CPU(s): 0-7
q_new
: 建立新的「空」佇列;q_free
: 釋放佇列所佔用的記憶體;q_insert_head
: 在佇列開頭 (head) 插入 (insert) 給定的新節點 (以 LIFO 準則);q_insert_tail
: 在佇列尾端 (tail) 插入 (insert) 給定的新節點 (以 FIFO 準則);q_remove_head
: 在佇列開頭 (head) 移去 (remove) 給定的節點;q_release_element
: 釋放特定節點的記憶體空間;q_size
: 計算目前佇列的節點總量;q_delete_mid
: 移走佇列中間的節點,詳見 LeetCode 2095. Delete the Middle Node of a Linked Listq_delete_dup
: 在已經排序的狀況,移走佇列中具備重複內容的節點,詳見 LeetCode 82. Remove Duplicates from Sorted List IIq_swap
: 交換佇列中鄰近的節點,詳見 LeetCode 24. Swap Nodes in Pairsq_reverse
: 以反向順序重新排列鏈結串列,該函式不該配置或釋放任何鏈結串列節點,換言之,它只能重排已經存在的節點;q_sort
: 以遞增順序來排序鏈結串列的節點,可參閱 Linked List Sort 得知實作手法;struct list_head *q_new()
{
struct list_head *head = malloc(sizeof(struct list_head));
if(!head){
return NULL;
}
INIT_LIST_HEAD(head);
return head;
}
list.h
中的 INIT_LIST_HEAD
將 next 和 prev 指向 head 本身並且回傳 head,否則回傳 NULL 表示配置失敗void q_free(struct list_head *l)
{
if(!l){
return;
}
element_t *e, *safe;
list_for_each_entry_safe(e, safe, l, list){
q_release_element(e);
}
free(l);
}
list_for_each_entry_safe
找到 queue 中的每個 element 並釋放list_for_each_entry
,是因為在迭代的過程中會將 element 刪除,list_for_each_entry
沒有對 node 做保護,若隨意刪除節點,將會無法找到下一個 node用通順的漢語書寫。
:notes: jserv
bool q_insert_head(struct list_head *head, char *s)
{
element_t *e = malloc(sizeof(element_t));
if(!e){
return false;
}
e->value = strdup(s);
list_add(&e->list, head);
return true;
}
element_t *e
,若是無法配置則回傳 falselist_add
將 e->list 加入 queue 的開頭並回傳true if(!e || !head){
return false;
}
q_insert_head
判斷式,當 queue 尚未被建立時,不能插入節點,應回傳 falsee->value = strdup(s);
if (!e->value) {
free(e);
return false;
}
bool q_insert_tail(struct list_head *head, char *s)
{
element_t *e = malloc(sizeof(element_t));
if(!e){
return false;
}
e->value = strdup(s);
list_add_tail(&e->list, head);
return true;
}
q_insert_head
相似,唯一差別在於 e->list 加入的位置list_add_tail
將 e->list 加入 queue 的 tail if(!e || !head){
return false;
}
q_insert_head
一樣問題element_t *q_remove_head(struct list_head *head, char *sp, size_t bufsize)
{
if (!head || list_empty(head)) {
return NULL;
}
element_t *e = list_first_entry(head, element_t, list);
list_del(head->next);
if (sp) {
sp = strncpy(sp, e->value, bufsize);
}
return e;
}
list_first_entry
取得 queue 的第一個 elementlist_del
將該節點移出
list_del
只有將 link 斷開,以下為list.h
註解: * list_del() - Remove a list node from the list
* @node: pointer to the node
*
* The node is only removed from the list. Neither the memory of the removed
* node nor the memory of the entry containing the node is free'd. The node
* has to be handled like an uninitialized node. Accessing the next or prev
* pointer of the node is not safe.
*
* Unlinked, initialized nodes are also uninitialized after list_del.
*
* LIST_POISONING can be enabled during build-time to provoke an invalid memory
* access when the memory behind the next/prev pointer is used after a list_del.
* This only works on systems which prohibit access to the predefined memory
* addresses.
qtest.c
中 do_remove
會主動將 \0 填入字串的末端element_t *q_remove_tail(struct list_head *head, char *sp, size_t bufsize)
{
if (!head || list_empty(head)) {
return NULL;
}
element_t *e = list_last_entry(head, element_t, list);
list_del(head->prev);
if (sp) {
sp = strncpy(sp, e->value, bufsize);
sp[bufsize - 1] = '\0';
}
return e;
}
q_remove_head
雷同,只差在移除 head->next 還是 head->prevint q_size(struct list_head *head)
{
if (!head) {
return 0;
}
int len = 0;
struct list_head *li;
list_for_each (li, head)
len++;
return len;
}
lab0-c
作業說明,目前還沒想到更好的寫法bool q_delete_mid(struct list_head *head)
{
// https://leetcode.com/problems/delete-the-middle-node-of-a-linked-list/
if (!head || list_empty(head)) {
return NULL;
}
struct list_head *fast;
struct list_head **idir = &head->next;
for (fast = head->next; fast == head || fast->next == head; fast = fast->next->next) {
idir = &(*idir)->next;
}
element_t *e = list_entry(*idir, element_t, list);
list_del(*idir);
free(e);
return true;
}
Floyd's tortoise and hare
演算法在本課程中,「核心」一定指 OS kernel,你該避免濫用該詞彙。附上必要的參考資訊。
:notes: jserv
void q_swap(struct list_head *head)
{
// https://leetcode.com/problems/swap-nodes-in-pairs/
if (!head) {
return;
}
struct list_head *l;
for (l = head->next; (l != head) && (l->next != head) ;l = l->next) {
list_move(l, l->next);
}
}
list_move
將當前 node 插入 node->next 的 nextvoid q_reverse(struct list_head *head)
{
if (!head) {
return;
}
struct list_head *l, *safe;
list_for_each_safe (l, safe, head) {
l->next = l->prev;
l->prev = safe;
}
l = head->next;
head->next = head->prev;
head->prev = l;
}
list_for_each_safe
list_for_each_safe (l, safe, head) {
list_move(l, head);
}
struct list_head *mergeTwoLists(struct list_head *l1, struct list_head *l2)
{
struct list_head *head = NULL, **ptr = &head, **node = NULL;
for (node = NULL; l1 && l2; *node = (*node)->next) {
element_t *e1 = list_entry(l1, element_t, list);
element_t *e2 = list_entry(l2, element_t, list);
node = strcmp(e1->value,e2->value) < 0 ? &l1: &l2;
*ptr = *node;
ptr = &(*ptr)->next;
}
*ptr = (struct list_head *) ((uintptr_t) l1 | (uintptr_t) l2);
return head;
}
struct list_head *mergesort(struct list_head *head)
{
if (!head || !head->next) {
return head;
}
struct list_head *fast, *slow;
slow = head;
for (fast = head->next; fast && fast->next;
fast = fast->next->next) {
slow = slow->next;
}
struct list_head *left, *right;
right = slow->next;
slow->next = NULL;
left = mergesort(head);
right = mergesort(right);
return mergeTwoLists(left, right);
}
/*
* Sort elements of queue in ascending order
* No effect if q is NULL or empty. In addition, if q has only one
* element, do nothing.
*/
void q_sort(struct list_head *head)
{
if ((!head || list_empty(head)) || list_is_singular(head)) {
return;
}
head->prev->next = NULL;
head->next = mergesort(head->next);
struct list_head *l, *prev = head;
for (l = head->next; l->next != NULL; l = l->next) {
l->prev = prev;
prev = l;
}
l->next = head;
head->prev = l;
}
ERROR: Computed queue size as 9973, but correct value is 10000
q_reverse
沒有寫好,導致有 link 連接錯誤,想說試試 list_move
的方式實作,便成功通過測資q_size
改寫成反方向 traverse 後,才發現 merge 結束時 queue size 已經不對,代表 q_sort
只有正確地連接順向的 nodel
的 l->prev
沒接到struct list_head *l, *prev = head;
for (l = head->next; l->next != NULL; l = l->next) {
l->prev = prev;
prev = l;
}
l->prev = prev;
l->next = head;
head->prev = l;
q_reverse
實作方式會通過測資的原因在於 list_move
串接了遺漏的 link,使 queue 在經過 reverse 後能正確地 free__strcmp_avx2 () at ../sysdeps/x86_64/multiarch/strcmp-avx2.S:101
101 ../sysdeps/x86_64/multiarch/strcmp-avx2.S: No such file or directory.
根據 Scottk 網友在 stackoverflow 的說明,似乎是 strcmp 會建立一個 assembler file - strcmp-avx2.S,但由於傳進去的 pointer 為 NULL pointer,導致它無法創建該檔案,但可能要找到更多證據來支持這項論述
請愛用 GDB 排除問題。
:notes: jserv
bool q_delete_dup(struct list_head *head)
{
// https://leetcode.com/problems/remove-duplicates-from-sorted-list-ii/
if (!head) {
return false;
}
element_t *e, *safe;
list_for_each_entry_safe(e, safe, head, list) {
if (e->list.next == head) {
break;
}
if (strcmp(e->value, safe->value) == 0) {
list_del(&e->list);
q_release_element(e);
}
}
return true;
}
strcmp
不能接受 NULL 值,所以要使用一段判斷式跳脫迴圈,避免 safe->value = NULL 傳入 strcmp
__attribute__
用途根據 gcc 的說明,__attribute__
能指定變數、函式參數及結構等等的屬性
The keyword
__attribute__
allows you to specify special properties of variables, function parameters, or structure, union, and, in C++, class members. This__attribute__
keyword is followed by an attribute specification enclosed in double parentheses. Some attributes are currently defined generically for variables. Other attributes are defined for variables on particular target systems. Other attributes are available for functions (see Function Attributes), labels (see Label Attributes), enumerators (see Enumerator Attributes), statements (see Statement Attributes), and for types (see Type Attributes). Other front ends might define more attributes (see Extensions to the C++ Language).
其中在 list_sort.c 用到 function attribute,其能幫助編譯器最佳化及更正確地檢查 code 是否出錯
In GNU C and C++, you can use function attributes to specify certain function properties that may help the compiler optimize calls or check code more carefully for correctness. For example, you can use attributes to specify that a function never returns (noreturn), returns a value depending only on the values of its arguments (const), or has printf-style arguments (format).
譬如 list_sort.c 用到不少 nonnull attribute,能指定函式的哪些參數不能為 NULL pointers,以下方例子來說, nonnull(2,3,4)
函式第二到四的參數不能為 NULL pointers,也就是 cmp, a, b 這三個參數
__attribute__((nonnull(2,3,4)))
static struct list_head *merge(void *priv,
list_cmp_func_t cmp,struct list_head *a, struct list_head *b)
Merge 方式
list_sort.c 使用了兩種 merge 函式
struct list_head *head, **tail = &head;
宣告兩個變數 * head
及 ** tail
digraph ele_list {
rankdir=LR;
node[shape=record];
head [shape=plaintext label="head"]
tail [shape=plaintext label="tail"]
NULL [shape=plaintext]
tail -> head
head -> NULL
}
利用 cmp 比較 a b 大小,若是小於或相等則選擇 a 為下一個 node,用來確保 sort stability
if (cmp(priv, a, b) <= 0)
將 a 放入 tail 指向的地址,也就是 head 的地址
1. *tail = a;
digraph ele_list {
rankdir=LR;
node[shape=record];
head [label="head(a)"]
tail [shape=plaintext label="tail"]
tail -> head
head -> a_next
}
將 tail 指向的地址改為 &a->next
2. tail = &a->next;
digraph ele_list {
rankdir=LR;
node[shape=record];
head [label="head(a)"]
a_next [label="a_next"]
tail [shape=plaintext label="tail"]
tail -> a_next
head -> a_next
}
將 a 指到 a->next
3. a = a->next;
digraph ele_list {
rankdir=LR;
node[shape=record];
head [label="head"]
a [label="a"]
tail [shape=plaintext label="tail"]
a_next [shape=plaintext]
tail -> a
head -> a
a -> a_next
}
接著重複此過程,不斷地將 a or b 串接起來
1. *tail = a;
digraph ele_list {
rankdir=LR;
node[shape=record];
head [label="head"]
a [label="a"]
tail [label="tail"]
node_2 [label="..."]
head -> node_1
node_1 -> node_2
node_2 -> node_n
node_n -> a
tail -> a
a -> a_next
}
2. tail = &a->next;
digraph ele_list {
rankdir=LR;
node[shape=record];
head [label="head"]
a [label="a"]
tail [label="tail"]
node_2 [label="..."]
head -> node_1
node_1 -> node_2
node_2 -> node_n
node_n -> a
a -> a_next
tail -> a_next
}
3. a = a->next;
digraph ele_list {
rankdir=LR;
node[shape=record];
head [label="head"]
a [label="a"]
tail [label="tail"]
node_2 [label="..."]
head -> node_1
node_1 -> node_2
node_2 -> node_n
node_n -> previous_a
previous_a -> a
tail -> a
}
直到 a or b 為 NULL,便將非 NULL 的 list 串到 *tail 指到的地址
if (!a) {
*tail = b;
break;
}
digraph ele_list {
rankdir=LR;
node[shape=record];
head [label="head"]
tail [label="tail"]
node_2 [label="..."]
a [shape=plaintext label="a_NULL"]
b -> b_1
head -> node_1
node_1 -> node_2
node_2 -> node_n
node_n -> a
tail -> a
}
在操作之後
digraph ele_list {
rankdir=LR;
node[shape=record];
head [label="head"]
tail [label="tail"]
node_2 [label="..."]
head -> node_1
node_1 -> node_2
node_2 -> node_n
node_n ->b
tail -> b
b -> b_next
}
cmp(priv, a, b) > 0
,則將上列操作 a, b 對調即可和 merge 的差異為,merge_final 會將所有的 list->prev 串接至正確的 node
struct list_head *tail = head;
u8 count = 0;
for (;;) {
/* if equal, take 'a' -- important for sort stability */
if (cmp(priv, a, b) <= 0) {
tail->next = a;
a->prev = tail;
tail = a;
a = a->next;
if (!a)
break;
} else {
tail->next = b;
b->prev = tail;
tail = b;
b = b->next;
if (!b) {
b = a;
break;
}
}
}
將 tail->next 串接至 b 後,再把後續的 b->prev 串接完成
tail->next = b;
do {
if (unlikely(!++count))
cmp(priv, b, b);
b->prev = tail;
tail = b;
b = b->next;
} while (b);
需要注意的地方為以下部分:
if (unlikely(!++count))
cmp(priv, b, b);
!++count
: 由於 count 為 u_int8,代表在 count 遞增至 255 前,傳入 unlikely
的數值皆為 0
!++count
為 1unlikely
是一個巨集,定義在 compiler.h
,定義內容如下:
# define unlikely(x) __builtin_expect(!!(x), 0)
!!(x)
用途是使任何變數在兩次 not 後,剩下 0 或 1 兩種可能unlikely
的回傳值為實際上 !!(!++count)
的結果unlikely
會影響編譯器的 branch prediction,表示在該 if 條件下的 code,較不常被執行,進而將此段 code 擺到較為後方的位置至於為何需要比較相同的數值呢 ? 根據 merge_final
內的註解 :
/*
* If the merge is highly unbalanced (e.g. the input is
* already sorted), this loop may run many iterations.
* Continue callbacks to the client even though no
* element comparison is needed, so the client's cmp()
* routine can invoke cond_resched() periodically.
*/
cmp(priv, b, b)
是為了定期喚醒 cond_resched
,使得 cpu 的資源能被短暫釋放
但為何呼叫 cmp
能連帶觸發 cond_resched
,目前還沒搞懂
list_sort
首先來看 list_sort
的註解
* This mergesort is as eager as possible while always performing at least 2:1 balanced merges.
得知此 mergesort 的實作方式,是希望被 merge 的 sublists size 比例至少為 2 : 1 (較大的為較小的兩倍以上)
* Given two pending sublists of size 2^k, they are merged to a size-2^(k+1) list as soon as we have 2^k following elements.
* Thus, it will avoid cache thrashing as long as 32^k elements can fit into the cache. Not quite as good as a fully-eager bottom-up mergesort, but it does use 0.2n fewer comparisons, so is faster in the common case that everything fits into L1.
常見的 merge sort 會在遇到兩個大小為 \(2^k\) 的 list 時將其 merge,但 linux 的 merge sort 只有在兩個大小為 \(2^k\) 的 list 位於 pending 且再遇到一個 \(2^k\) 的 list 時,才會將 pending 中的兩個 \(2^k\) 的 list merge。若是此 \(3*2^k\) 的 list 能放入 cache,則可避免 cache thrashing 發生
* The merging is controlled by "count", the number of elements in the pending lists. This is beautifully simple code, but rather subtle.
* Each time we increment "count", we set one bit (bit k) and clear bits k-1 .. 0. Each time this happens (except the very first time for each bit, when count increments to 2^k), we merge two lists of size 2^k into one list of size 2^(k+1).
* This merge happens exactly when the count reaches an odd multiple of 2^k, which is when we have 2^k elements pending in smaller lists, so it's safe to merge away two lists of size 2^k.
* After this happens twice, we have created two lists of size 2^(k+1), which will be merged into a list of size 2^(k+2) before we create a third list of size 2^(k+1), so there are never more than two pending.
* The number of pending lists of size 2^k is determined by the state of bit k of "count" plus two extra pieces of information:
- The state of bit k-1 (when k == 0, consider bit -1 always set), and Whether the higher-order bits are zero or non-zero (i.e.is count >= 2^(k+1)).
裡面提到 count 是控制放入 pending 的 list 數量,會根據當前 count 而有不同效果,共有六種狀態 :
* There are six states we distinguish. "x" represents some arbitrary
* bits, and "y" represents some arbitrary non-zero bits:* 0: 00x: 0 pending of size 2^k; x pending of sizes < 2^k
* 1: 01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
* 2: x10x: 0 pending of size 2^k; 2^k + x pending of sizes < 2^k
* 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
* 4: y00x: 1 pending of size 2^k; 2^k + x pending of sizes < 2^k
* 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
* (merge and loop back to state 2)
接著來看實際程式碼
struct list_head *list = head->next, *pending = NULL;
size_t count = 0; /* Count of pending */
if (list == head->prev) /* Zero or one elements */
return;
/* Convert to a null-terminated singly-linked list. */
head->prev->next = NULL;
/* Find the least-significant clear bit in count */
for (bits = count; bits & 1; bits >>= 1)
tail = &(*tail)->prev;
/* Do the indicated merge */
if (likely(bits)) {
struct list_head *a = *tail, *b = a->prev;
a = merge(priv, cmp, b, a);
/* Install the merged result in place of the inputs */
a->prev = b->prev;
*tail = a;
}
for (bits = count; bits & 1; bits >>= 1)
likely(bits)
和 merge_final
中提到的 unlikely
相似,都能藉由此巨集幫助編譯器最佳化 /* Move one element from input list to pending */
list->prev = pending;
pending = list;
list = list->next;
pending->next = NULL;
count++;
Risheng1128
所作的研讀 Linux 核心原始程式碼 list_sort建立完 pending 後,此時 pending 內是由小到大排列完成的 sublists,剩下就是把 pending 內的所有 sublists 進行合併
for (;;) {
struct list_head *next = pending->prev;
if (!next)
break;
list = merge(priv, cmp, pending, list);
pending = next;
}
最後將 list 內的 prev links 串起,便完成此 sort
/* The final merge, rebuilding prev links */
merge_final(priv, cmp, head, pending, list);
list_sort
引入將 list_sort.c
及 list_sort.h
放入專案中,並將遺失定義的巨集補進 list_sort.h
內
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#include <stdint.h>
typedef uint8_t u8;
struct list_head;
typedef int
__attribute__((nonnull(2, 3))) (*list_cmp_func_t)(void *,
const struct list_head *,
const struct list_head *);
新增 cmd 的 option : list_sort
Options:
echo 1 Do/don't echo commands
error 5 Number of errors until exit
fail 30 Number of times allow queue operations to return false
length 1024 Maximum length of displayed string
list_sort 1 Use linux sort or not.
malloc 0 Malloc failure probability percent
用來決定要使用 linux 的 sort 或 自己實作的 sort
先設計一個自動測試檔 trace-18-compare.cmd
# Compare linux sort and my merge sorting effiency.
option echo 0
option verbose 1
# linux sort
new
ih RAND 262144
time sort
free
new
ih RAND 262144
time sort
free
new
ih RAND 262144
time sort
free
new
ih RAND 262144
time sort
free
new
ih RAND 262144
time sort
free
# my sort
option list_sort 0
new
ih RAND 262144
time sort
free
new
ih RAND 262144
time sort
free
new
ih RAND 262144
time sort
free
new
ih RAND 262144
time sort
free
new
ih RAND 262144
time sort
free
使用大小為 \(262144 (2^{18})\) 的資料進行排序
結果如下:
# linux sort
Delta time = 0.155
Delta time = 0.286
Delta time = 0.328
Delta time = 0.313
Delta time = 0.310
# my sort
Delta time = 0.601
Delta time = 0.585
Delta time = 0.586
Delta time = 0.581
Delta time = 0.577
可以發現 my sort 的 delta time 約為 linux sort 的 2.1 倍
接著嘗試利用 perf 進行 mergesort 效能分析
我將兩種 sort 各跑 5 次,perf 會將結果作平均
linux sort
Performance counter stats for './qtest -f traces/trace-18-compare.cmd' :
143,081,730 cache-misses # 59.404 % of all cache refs ( +- 0.67% )
240,861,121 cache-references ( +- 0.40% )
3,262,443,609 instructions # 0.65 insn per cycle ( +- 0.01% )
5,017,161,434 cycles ( +- 0.72% )
2.3186 +- 0.0291 seconds time elapsed ( +- 1.26% )
my sort
Performance counter stats for './qtest -f traces/trace-18-compare.cmd' :
132,646,439 cache-misses # 54.123 % of all cache refs ( +- 0.13% )
245,083,976 cache-references ( +- 0.56% )
3,263,356,237 instructions # 0.42 insn per cycle ( +- 0.00% )
7,717,237,980 cycles ( +- 0.22% )
3.3432 +- 0.0180 seconds time elapsed ( +- 0.54% )
list_sort() |
q_sort() |
|
---|---|---|
cache-misses | 143,081,730 | 132,646,439 |
cache-references | 240,861,121 | 245,083,976 |
instructions | 3,262,443,609 | 3,263,356,237 |
cycles | 5,017,161,434 | 7,717,237,980 |
我發現 cache-misses, cache-references 及 instruction,兩者的表現接近,但在比較 cycles 時,my sort 比起多了 27 億個 cycles,差異不小
看懂 Fisher-Yates shuffle,想法如下:
將 shuffle 加入 console_int()
內
ADD_COMMAND(shuffle," | Shuffle every nodes in queue");
便能利用 shuffle 命令將 queue 隨機洗混,操作如下:
cmd> new
l = []
cmd> ih RAND 10
l = [uawvu iricgbk jvordyvfs tqbmrqm ahntsn gkcct tanygz jlaslaylm vcxvuj tqnffm]
cmd> shuffle
l = [vcxvuj uawvu jvordyvfs jlaslaylm tqnffm tanygz gkcct tqbmrqm ahntsn iricgbk]