2022q1 Homework1 (lab0)

contributed by < Xx-oX >

實驗環境

$ gcc --version
gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0

$ lscpu
Architecture:                    x86_64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
Address sizes:                   43 bits physical, 48 bits virtual
CPU(s):                          8
On-line CPU(s) list:             0-7
Thread(s) per core:              2
Core(s) per socket:              4
Socket(s):                       1
NUMA node(s):                    1
Vendor ID:                       AuthenticAMD
CPU family:                      23
Model:                           17
Model name:                      AMD Ryzen 7 2700U with Radeon Vega Mobile Gfx
Stepping:                        0
Frequency boost:                 enabled
CPU MHz:                         1700.000
CPU max MHz:                     2200.0000
CPU min MHz:                     1600.0000
BogoMIPS:                        4391.49
Virtualization:                  AMD-V
L1d cache:                       128 KiB
L1i cache:                       256 KiB
L2 cache:                        2 MiB
L3 cache:                        4 MiB
NUMA node0 CPU(s):               0-7

開發過程 - queue.c

q_new

運用 INIT_LIST_HEAD，將 head 的 next 跟 prev 指向自己

struct list_head *q_new()
{
    struct list_head *head = malloc(sizeof(struct list_head));

    if (!head)
        return NULL;

    INIT_LIST_HEAD(head);

    return head;
}

q_free

利用 container_of 找到目前 list_head 指向的 element_t，並釋放空間

for 迴圈的部份之後會改用 for_each 巨集處理

改成使用 list_for_each_entry_safe 巨集，以精簡程式碼

void q_free(struct list_head *l)
{
    if (!l)
        return;

    element_t *entry, *safe;

    list_for_each_entry_safe (entry, safe, l, list)
        q_release_element(entry);

    free(l);
}

q_insert_head

分開檢查 head 以避免不必要的 malloc
strlen(s) + 1 是為了結尾 '\0' 的空間
將 strlen(s) + 1 存起來，避免重複運算，以通過 trace-17 (見後面 Make test 段落)
最後利用 list_add 來加入佇列





















bool q_insert_head(struct list_head *head, char *s)
{
    if (!head)
        return false;

    element_t *el = malloc(sizeof(element_t));
    if (!el)
        return false;

    size_t len = strlen(s) + 1;
    el->value = malloc(sizeof(char) * len);
    if (!el->value) {
        free(el);
        return false;
    }
    memcpy(el->value, s, len);

    list_add(&el->list, head);

    return true;
}

在第12行 q_release_element(el) 中，
不確定 free(el->value) 的必要性 => 我認為是多餘的，理由如下

查詢 manpage：
The free() function frees the memory space pointed to by ptr, which must have been returned by a previous call to malloc(), calloc(), or realloc().
Otherwise, or if free(ptr) has already been called before, undefined behavior occurs.
If ptr is NULL, no operation is performed.

會執行這行表示 el->value 的值為 NULL，根據上面敘述，不會執行任何動作

TODO: 使用記憶體分析工具，搭配實驗來探討 jserv

實驗

設計兩個程式 test1.c test2.c

#include <stdlib.h>

struct space {
    char padding[16];
    char *var;
};

int main()
{
    struct space *s = malloc(sizeof(struct space));
    
    s->var = NULL;

    /*test 1*/
    free(s);

    /*test 2*/
    free(s->var);
    free(s);
    
    return 0;
}

執行 gcc -S [src] 查看組合語言發現

~/tmp » diff test1.s test2.s
1c1
< 	.file	"test1.c"
---
> 	.file	"test2.c"
19a20,23
> 	movq	-8(%rbp), %rax
> 	movq	16(%rax), %rax
> 	movq	%rax, %rdi
> 	call	free@PLT

多了一段 subroutine call

但若加入 -O1 參數執行編譯器最佳化 gcc -O1 -S [src] 之後則兩者沒有差別

~/tmp » diff test1.s test2.s
1c1
< 	.file	"test1.c"
---
> 	.file	"test2.c"

而 -O1 是本次作業中採用的最佳化參數，由此可見有沒有 free(s->var) 在 s->var == NULL 時沒有區別，跟設想中的一樣。

記憶體分析：

結合 test1 (-> fun_a), test2 (-> fun_b) 再加入兩個對照組 (fun_c, fun_d)
依照順序分時執行
(沒有使用編譯器最佳化)

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

struct space {
    char padding[12];
    char *var;
};

void fun_a()
{
    struct space *s = malloc(sizeof(struct space));
    s->var = NULL;
    sleep(1);
    free(s->var);
    free(s);
    sleep(1);
}

void fun_b()
{
    struct space *s = malloc(sizeof(struct space));
    s->var = NULL;
    sleep(1);
    free(s);
    sleep(1);   
}

void fun_c()
{
    struct space *s = malloc(sizeof(struct space));
    s->var = strdup("ccc");
    sleep(1);
    free(s->var);
    free(s);
    sleep(1);
}

void fun_d()
{
    struct space *s = malloc(sizeof(struct space));
    s->var = strdup("ddd");
    sleep(1);
    free(s);
    sleep(1);
}

int main(int argc, char *argv[])
{
    fun_a(); 
    fun_b();
    fun_c();
    fun_d();
    return 0;
}

使用 valgrind-massif 分析 heap 的使用情形

並參考去年 jasonmatobo 同學的作法，
用 massif-visuallizer 呈現結果，得到下圖

可以看出 fun_a, fun_b 對於 heap 的使用是一樣的，由此佐證前述之猜測。
(其中 fun_d 會導致 memory-leak，因為沒有將 strdup 產生出的空間釋出)

q_insert_tail

與 q_insert_head 相似，改用 list_add_tail 來加入佇列的尾端

bool q_insert_tail(struct list_head *head, char *s)
{
    if (!head)
        return false;

    element_t *el = malloc(sizeof(element_t));
    if (!el)
        return false;

    size_t len = strlen(s) + 1;
    el->value = malloc(sizeof(char) * len);
    if (!el->value) {
        free(el);
        return false;
    }
    memcpy(el->value, s, len);

    list_add_tail(&el->list, head);

    return true;
}

q_remove_head

斷開 head 連結，並將 element 中的內容複製到 sp 中
strlen 不會計算 null terminator

The strlen() function calculates the length of the string pointed to by s, excluding the terminating null byte ('\0').

element_t *q_remove_head(struct list_head *head, char *sp, size_t bufsize)
{
    if (!head || !sp)
        return NULL;

    struct list_head *node = head->next;
    if (node == head)
        return NULL;

    element_t *el = list_entry(node, element_t, list);
    size_t len = strnlen(el->value, bufsize - 1);
    memcpy(sp, el->value, len);
    *(sp + len * sizeof(char)) = '\0';

    list_del(node);

    return el;
}

新增巨集 min

參考自 minmax
#define min(x, y) (x) < (y) ? (x) : (y)

後來使用 strnlen 取代 min 的功能

q_remove_tail

跟 q_remove_head 相似，改為移除 tail

element_t *q_remove_tail(struct list_head *head, char *sp, size_t bufsize)
{
    if (!head || !sp)
        return NULL;

    struct list_head *node = head->prev;
    if (node == head)
        return NULL;

    element_t *el = list_entry(node, element_t, list);
    size_t len = strnlen(el->value, bufsize - 1);
    memcpy(sp, el->value, len);
    *(sp + len * sizeof(char)) = '\0';

    list_del(node);

    return el;
}

q_release_element

沒有更動

void q_release_element(element_t *e)
{
    free(e->value);
    free(e);
}

q_size

照抄自作業要求中的牛刀小試

int q_size(struct list_head *head)
{
    if (!head)
        return 0;

    int len = 0;
    struct list_head *pos;
    list_for_each (pos, head)
        len++;
    return len;
}

q_delete_mid

從兩邊夾擠找到中間節點，利用 index 來判斷位置
新增巨集 del_node 來刪除節點

#define del_node(node)                                       \
    {                                                        \
        element_t *el = list_entry(node, element_t, list);   \
        list_del(node);                                      \
        q_release_element(el);                               \
    }

bool q_delete_mid(struct list_head *head)
{
    // https://leetcode.com/problems/delete-the-middle-node-of-a-linked-list/

    if (!head || list_empty(head))
        return false;

    // squeeze to find the middle-node
    struct list_head *p = head->next;
    int hindex = 0, tindex = q_size(head) - 1;  // O(n)

    for (; hindex < tindex; hindex++, tindex--, p = p->next)
        ;

    /*
     * hp: go from head to tail
     * tp: go from tail to head
     * odd nodes: head -> ... -> hp == tp -> ...
     * even nodes: head -> ... -> tp -> hp -> ...
     * thus delete "hp"
     * => simplify to one pointer p
     */
    del_node(p);  // delete middle node

    return true;
}

q_delete_dup

利用雙重迴圈來查找重複的節點
新增巨集 get_value，縮短程式碼

#define get_value(node) list_entry(node, element_t, list)->value

bool q_delete_dup(struct list_head *head)
{
    // https://leetcode.com/problems/remove-duplicates-from-sorted-list-ii/

    if (!head)
        return false;

    if (list_empty(head))
        return true;

    struct list_head *p = head->next;

    for (; p != head && p->next != head;) {
        if (strcmp(get_value(p), get_value(p->next)) == 0) {
            struct list_head *np = p->next;
            char *str = strdup(get_value(np));
            for (; np->next != head && strcmp(str, get_value(np->next)) == 0;) {
                np = np->next;
                del_node(np->prev);
            }
            del_node(np);
            free(str);  // release momery allocated by strdup
            p = p->next;
            del_node(p->prev);
        } else {
            p = p->next;
        }
    }

    return true;
}

q_swap

節點交換之後， p 的 next 也改為指向本來的下下個節點，所以直接遍歷即可
新增內部函式 swap_node，交換任意兩個節點（相鄰 / 不相鄰）

詳細討論

a, b 為要交換的節點
相鄰的情形
從 … -> m -> a -> b -> n -> …
移除 a, b => … -> m -> n -> …
最後加回 a, b => … -> m -> b -> a -> n -> …
不相鄰的情形
從 … -> m -> a -> c -> … -> d -> b -> n -> …
移除 a, b => … -> m -> c -> … -> d -> n -> …, a, b
最後加回 a, b => … -> m -> b -> c -> … -> d -> a -> n -> …

static inline void swap_node(struct list_head *a, struct list_head *b)
{
    struct list_head *m = a->prev;
    struct list_head *n = b->next;
    /*remove a*/
    m->next = a->next;
    a->next->prev = m;
    /*remove b*/
    b->prev->next = n;
    n->prev = b->prev;
    /*insert b*/
    m->next->prev = b;
    b->next = m->next;
    m->next = b;
    b->prev = m;
    /*insert a*/
    n->prev->next = a;
    a->prev = n->prev;
    n->prev = a;
    a->next = n;
}

寫成 static inline 避免額外的 branch ，類似巨集展開的方式

寫成巨集的形式會無法正常執行，尚在找出問題點（或許跟編譯器最佳化有關）

#define swap_node(a, b)                \
   {                                  \
       struct list_head *m = a->prev; \
       struct list_head *n = b->next; \
       /*remove a*/                   \
       m->next = a->next;             \
       a->next->prev = m;             \
       /*remove b*/                   \
       b->prev->next = n;             \
       n->prev = b->prev;             \
       /*insert b*/                   \
       m->next->prev = b;             \
       b->next = m->next;             \
       m->next = b;                   \
       b->prev = m;                   \
       /*insert a*/                   \
       n->prev->next = a;             \
       a->prev = n->prev;             \
       n->prev = a;                   \
       a->next = n;                   \
   }

void q_swap(struct list_head *head)
{
    // https://leetcode.com/problems/swap-nodes-in-pairs/

    if (!head || list_empty(head))
        return;

    struct list_head *p = head->next;

    for (; p != head && p->next != head; p = p->next) {
        swap_node_adjacent(p, p->next);
    }
}

q_reverse

與 q_delete_mid 相似，從兩端夾擠並利用 swap_node 逐步交換

void q_reverse(struct list_head *head)
{
    if (!head || list_empty(head))
        return;

    // hp: head pointer, tp: tail pointer
    struct list_head *hp = head->next;
    struct list_head *tp = head->prev;
    int hindex = 0, tindex = q_size(head) - 1;  // O(n)

    for (; hindex < tindex; hindex++, tindex--) {
        struct list_head *a = hp;
        struct list_head *b = tp;
        hp = hp->next;
        tp = tp->prev;
        swap_node(a, b);
    }
}

q_sort

使用遞迴形式的 merge sort 來實作
分成兩個函式：拆兩半用的 merge_sort 以及合併用的 merge

merge_sort

利用 Floyd's Algorithm 找到中央節點 p，並分成兩個 circular doubly-linked list (以 h 為首以及以 p為首)

此處的 h 是佇列中的第一個元素，而不是利用 q_new 得到的 head

因為無法配置記憶體的限制（qtest.c 中第 674 行 set_noallocate_mode(true)），故將原本的 head 移除，直接以佇列中第一個元素取代

將兩段佇列遞迴呼叫 merge_sort，再進行合併

struct list_head *merge_sort(struct list_head *h)
{
    // single
    if (h->next == h)
        return h;

    // Floyd's Algorithm, a.k.a. Tortoise and Hare Algorithm
    struct list_head *hare = h->next, *tortoise = h->next;
    for (; hare->next != h && hare->next->next != h;
         tortoise = tortoise->next, hare = hare->next->next)
        ;

    struct list_head *p = tortoise;

    /*
     * Split into two circular-doubly-linked list.
     * before: -> h -> ... -> ht -> p -> ... -> pt ->
     * result: -> h -> ... -> ht -> , -> p -> ... -> pt ->
     */
    struct list_head *ht = p->prev;
    struct list_head *pt = h->prev;
    ht->next = h;
    h->prev = ht;
    pt->next = p;
    p->prev = pt;

    // recurision
    h = merge_sort(h);
    p = merge_sort(p);

    // merge into one list
    return merge(h, p);
}

merge

先判斷是否為不平衡的情形，如果是則直接將兩個佇列相接 (參見下方 Make test 段落)
參考 list_splice 新增函式 splice 來串接兩個被移除 head 的佇列

static inline void splice(struct list_head *a, struct list_head *b)
{
    struct list_head *a_tail = a->prev;
    struct list_head *b_tail = b->prev;

    a_tail->next = b;
    b->prev = a_tail;
    b_tail->next = a;
    a->prev = b_tail;
}

將兩個佇列先攤平（ flatten ）以便進行迴圈條件判斷

攤平亦即拆開頭尾的連接部份，使其不再循環

此處參考你所不知道的 C 語言：linked list 和非連續記憶體中「從 Linux 核心的藝術談起」下的案例探討，使用 pointer to pointer 的技巧來省去一次條件判斷

即 struct list_head **cmp ，用來移動 rp 跟 lp 中對應元素數值較小者

第二次迴圈用來將剩下的元素填入

舉例說明：合併 r = [1, 2, 3] 與 l = [4, 5, 6] 時，
r 中的元素會先被用盡（ rp == NULL ），而 l 卻還是滿的（ lp == l ），此時就需要第二個迴圈來將 l 中的元素填入

struct list_head *merge(struct list_head *r, struct list_head *l)
{
    // for unbalanced case like 1, ..., 1 vs 2, ..., 2
    if (strcmp(get_value(r->prev), get_value(l)) < 0 ||
        strcmp(get_value(r->prev), get_value(l)) == 0) {
        splice(r, l);
        return r;
    } else if (strcmp(get_value(l->prev), get_value(r)) < 0) {
        splice(l, r);
        return l;
    }
    
    /*
     * flatten
     * NULL -> r -> ... -> rt -> NULL
     * NULL -> l -> ... -> lt -> NULL
     */
    r->prev->next = NULL;  // rt->next = NULL
    r->prev = NULL;
    l->prev->next = NULL;  // lt->next = NULL
    l->prev = NULL;

    // decide head
    struct list_head *rp = r, *lp = l;
    struct list_head **cmp;
    cmp = (strcmp(get_value(rp), get_value(lp)) < 0) ? &rp : &lp;
    struct list_head *h = *cmp;
    struct list_head *tail = h;
    *cmp = (*cmp)->next;

    for (cmp = NULL; rp && lp; *cmp = (*cmp)->next, tail = tail->next) {
        cmp = (strcmp(get_value(rp), get_value(lp)) < 0) ? &rp : &lp;
        tail->next = *cmp;
        (*cmp)->prev = tail;
    }

    // connect the last(rest) node
    cmp = rp ? &rp : &lp;
    for (; *cmp; *cmp = (*cmp)->next, tail = tail->next) {
        tail->next = *cmp;
        tail->next->prev = tail;
    }

    // circulate the list
    tail->next = h;
    h->prev = tail;

    return h;
}

最後將 merge_sort 的結果接回 head 即完成

void q_sort(struct list_head *head)
{
    if (!head || list_empty(head) || list_is_singular(head))
        return;

    // dehead, and remake the circle
    struct list_head *p = head->next;
    p->prev = head->prev;
    head->prev->next = p;

    p = merge_sort(p);

    // conect result p to head
    head->next = p;
    head->prev = p->prev;
    p->prev->next = head;
    p->prev = head;
}

更動紀錄

原始碼中使用 #define MERGE_SORT_DEBUG 搭配 #ifdef MERGE_SORT_DEBUG, #endif 來顯示除錯訊息

將原本比較第一個字元改成使用 strcmp 來比較第一個字串 => 通過 make test

// before
cmp = (get_value(rp)[0] < get_value(lp)[0]) ? &rp : &lp;
// after
cmp = (strcmp(get_value(rp), get_value(lp)) < 0) ? &rp : &lp;

將找尋中間節點的方法改成 Floyd's Algorithm

// before
int hindex = 0, tindex = 0;
   struct list_head *p;
   for (p = h->next; p != h; tindex++, p = p->next)
       ;
   // find middle node
   p = h;
   for (; hindex < tindex; hindex++, tindex--, p = p->next)
       ;
// after
struct list_head *hare = h->next, *tortoise = h->next;
   for (; hare->next != h && hare->next->next != h;
        tortoise = tortoise->next, hare = hare->next->next)
       ;
   struct list_head *p = tortoise;

但依舊過不了 trace-14

Make Test

本地端 make test 可以全部通過，但是在 github action 中無法通過 trace-14 跟 trace-17

其中 trace-14 是因為排序的複雜度過高
首先改用 Floyd's Algorithm 來改良 q_sort 找尋中間節點的方法 => 但依然無法通過

後來觀察 trace/trace-14-perf.cmd

# Test performance of insert_tail, reverse, and sort
option fail 0
option malloc 0
new
ih dolphin 1000000
it gerbil 1000000
reverse
sort

發現要對兩個極度不平衡的佇列作排序，於是針對這樣的情形作優化 => 成功通過！

此處的不平衡指的是如 [1, 1, 2, 2] 與 [3, 3, 3, 3]
依照原先的方法會遍歷兩個佇列，但其實可以直接將後者接上前者
=> [1, 1, 2, 2, 3, 3, 3, 3]
會大幅降低這類情形的複雜度

而 trace-17 則是因為多次計算 strlen ，導致在 insert_head 時超出常數時間。
因此只要先將 strlen 的結果存起來便可解決。 (已通過)

研讀 Linux 核心原始程式碼的 lib/list_sort.c

研讀程式碼以及參考 freshLiver ， kdnvt ， arthur-chang 等同學的筆記

兩段重要註解

This mergesort is as eager as possible while always performing at least 2:1 balanced merges. Given two pending sublists of size 2^k, they are merged to a size-2^(k+1) list as soon as we have 2^k following elements.
Thus, it will avoid cache thrashing as long as 32^k elements can fit into the cache. Not quite as good as a fully-eager bottom-up mergesort, but it does use 0.2n fewer comparisons, so is faster in the common case that everything fits into L1.
The merging is controlled by "count", the number of elements in the pending lists. This is beautifully simple code, but rather subtle.
Each time we increment "count", we set one bit (bit k) and clear bits k-1 .. 0. Each time this happens (except the very first time for each bit, when count increments to 2^k), we merge two lists of size 2^k into one list of size 2^(k+1).
This merge happens exactly when the count reaches an odd multiple of 2^k, which is when we have 2^k elements pending in smaller lists, so it's safe to merge away two lists of size 2^k.
After this happens twice, we have created two lists of size 2^(k+1), which will be merged into a list of size 2^(k+2) before we create a third list of size 2^(k+1), so there are never more than two pending.
The number of pending lists of size 2^k is determined by the state of bit k of "count" plus two extra pieces of information:

The state of bit k-1 (when k == 0, consider bit -1 always set), and

Whether the higher-order bits are zero or non-zero (i.e. is count >= 2^(k+1)).
There are six states we distinguish. "x" represents some arbitrary bits, and "y" represents some arbitrary non-zero bits:

0: 00x: 0 pending of size 2^k; x pending of sizes < 2^k

1: 01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k

2: x10x: 0 pending of size 2^k; 2^k + x pending of sizes < 2^k

3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k

4: y00x: 1 pending of size 2^k; 2^k + x pending of sizes < 2^k

5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
(merge and loop back to state 2)
We gain lists of size 2^k in the 2->3 and 4->5 transitions (because bit k-1 is set while the more significant bits are non-zero) and merge them away in the 5->2 transition. Note in particular that just before the 5->2 transition, all lower-order bits are 11 (state 3), so there is one list of each smaller size.
When we reach the end of the input, we merge all the pending lists, from smallest to largest. If you work through cases 2 to 5 above, you can see that the number of elements we merge with a list of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).

Data structure invariants:

All lists are singly linked and null-terminated; prev pointers are not maintained.

pending is a prev-linked "list of lists" of sorted sublists awaiting further merging.

Each of the sorted sublists is power-of-two in size.

Sublists are sorted by size and age, smallest & newest at front.

There are zero to two sublists of each size.

A pair of pending sublists are merged as soon as the number of following pending elements equals their size (i.e. each time count reaches an odd multiple of that size). That ensures each later final merge will be at worst 2:1.

Each round consists of:

Merging the two sublists selected by the highest bit which flips when count is incremented, and Adding an element from the input as a size-1 sublist.

我觀察到三個重點

藉由對 count 的 bitwise 操作（6 個狀態，每當 pending 的數量符合預期，就進行合併），達到接近（但不大於）2:1 的合併，註解寫到可以減少 0.2n 次比較，雖然還是不如 fully-eager bottom-up mergesort

有點類似 state machine ，搭配 kdnvt同學的證明（雖然沒有看懂），發現是很縝密的數學

每個 sublist 的長度都是 2 的次方，使得同時間放在 cache 中的資料長度為
$3 * 2^{k}$ 可以通通放進 cache (註解寫到可以防止 cache thrashing)，同時也用來判斷該不該進行合併（配合第一點的 count）

我認知的 cache thrashing 是要重複替換在 cache line 中的資料，不太了解跟
$2^{k}$ 的關係

在合併過程中不維護 prev pointer 也不組成 cycle，最後再處理即可 => 可以減少指令數

引入到 lab0-c 專案

將 list_sort.c 與 list_sort.h 移植到專案中，
並參考 SmallHanley 同學對於 likely 以及 unlikely 的處理方式，新增兩個巨集

# define likely(x) __builtin_expect(!!(x), 1)            
# define unlikely(x) __builtin_expect(!!(x), 0)

然後移除所有的 __attribute__((nonnull(x, y))) （參數檢查）
以及改寫 list_cmp_func_t ，將第一個參數 void *priv (private data) 移除

最後在 qtest.c 中新增命令 lsort 與函式 do_lsort 並使用移植過來的 list_sort 來排序

int cmp(struct list_head *a, struct list_head *b)
{   
    return strcmp(list_entry(a, element_t, list)->value, list_entry(b, element_t, list)->value);
}

bool do_lsort(int argc, char *argv[])
{
    if (argc != 1) {
        report(1, "%s takes no arguments", argv[0]);
        return false;
    }

    if (!l_meta.l)
        report(3, "Warning: Calling sort on null queue");
    error_check();

    int cnt = q_size(l_meta.l);
    if (cnt < 2)
        report(3, "Warning: Calling sort on single node");
    error_check();

    list_cmp_func_t lcmp = (list_cmp_func_t)&cmp;

    set_noallocate_mode(true);
    if (exception_setup(true))
        list_sort(l_meta.l, lcmp);
    exception_cancel();
    set_noallocate_mode(false);

    bool ok = true;
    if (l_meta.size) {
        for (struct list_head *cur_l = l_meta.l->next;
             cur_l != l_meta.l && --cnt; cur_l = cur_l->next) {
            /* Ensure each element in ascending order */
            /* FIXME: add an option to specify sorting order */
            element_t *item, *next_item;
            item = list_entry(cur_l, element_t, list);
            next_item = list_entry(cur_l->next, element_t, list);
            if (strcasecmp(item->value, next_item->value) > 0) {
                report(1, "ERROR: Not sorted in ascending order");
                ok = false;
                break;
            }
        }
    }

    show_queue(3);
    return ok && !error_check();
}

效能比較

設計 trace-sort.cmd 與 trace-lsort.cmd 來進行測試

# trace-sort
option fail 0
option malloc 0
new
ih dolphin 500000
sort #lsort for trace-lsort
ih RAND 100000
sort #lsort for trace-lsort

使用 pref 來進行效能紀錄
命令： perf stat --repeat 10 -e branches,branch-misses,cache-references,cache-misses,instructions,cycles ./qtest -f <trace file>
結果

我的 merge_sort

 Performance counter stats for './qtest -f trace-sort.cmd' (10 runs):

       257,018,128      branches                                                      ( +-  1.13% )  (82.96%)
         1,707,051      branch-misses             #    0.66% of all branches          ( +-  0.77% )  (83.16%)
       110,697,814      cache-references                                              ( +-  0.75% )  (83.33%)
         6,790,606      cache-misses              #    6.134 % of all cache refs      ( +-  1.25% )  (83.49%)
     1,196,117,018      instructions              #    0.71  insn per cycle           ( +-  1.01% )  (83.65%)
     1,687,262,338      cycles                                                        ( +-  0.79% )  (83.41%)

           0.47257 +- 0.00573 seconds time elapsed  ( +-  1.21% )

Linux 的 list_sort

 Performance counter stats for './qtest -f trace-lsort.cmd' (10 runs):

       329,315,504      branches                                                      ( +-  0.94% )  (83.12%)
         3,532,435      branch-misses             #    1.07% of all branches          ( +-  0.83% )  (83.22%)
        86,018,106      cache-references                                              ( +-  0.41% )  (83.26%)
         5,567,120      cache-misses              #    6.472 % of all cache refs      ( +-  0.95% )  (83.42%)
     1,479,511,938      instructions              #    0.93  insn per cycle           ( +-  0.70% )  (83.67%)
     1,584,030,393      cycles                                                        ( +-  0.51% )  (83.30%)

           0.44128 +- 0.00383 seconds time elapsed  ( +-  0.87% )

分別跑 10000, 50000, 100000, 500000筆隨機測資和 500000筆相同測資(跑10次取平均)

詳細結果

10000

 Performance counter stats for './qtest -f trace-sort.cmd' (10 runs):

         5,593,870      branches                                                      ( +-  6.51% )  (59.94%)
           106,362      branch-misses             #    1.90% of all branches          ( +-  6.39% )  (67.67%)
         1,062,361      cache-references                                              ( +-  2.43% )  (89.23%)
           194,112      cache-misses              #   18.272 % of all cache refs      ( +-  2.36% )
        28,035,409      instructions              #    1.19  insn per cycle           ( +-  0.04% )
        23,598,141      cycles                                                        ( +-  3.36% )  (83.16%)

          0.010288 +- 0.000675 seconds time elapsed  ( +-  6.56% )

 Performance counter stats for './qtest -f trace-lsort.cmd' (10 runs):

         6,012,970      branches                                                      ( +-  4.98% )  (50.65%)
           159,851      branch-misses             #    2.66% of all branches          ( +-  6.77% )  (75.37%)
           963,757      cache-references                                              ( +-  3.29% )  (93.25%)
           194,570      cache-misses              #   20.189 % of all cache refs      ( +-  1.41% )
        27,118,254      instructions              #    1.25  insn per cycle           ( +-  0.06% )
        21,760,239      cycles                                                        ( +-  2.80% )  (80.73%)

          0.008412 +- 0.000679 seconds time elapsed  ( +-  8.08% )

50000

Performance counter stats for './qtest -f trace-sort.cmd' (10 runs):

        27,880,195      branches                                                      ( +-  1.86% )  (81.27%)
           555,234      branch-misses             #    1.99% of all branches          ( +-  1.52% )  (82.45%)
         5,845,745      cache-references                                              ( +-  3.12% )  (83.06%)
         1,223,135      cache-misses              #   20.924 % of all cache refs      ( +-  2.41% )  (85.06%)
       149,999,368      instructions              #    0.68  insn per cycle           ( +-  0.93% )  (85.85%)
       220,961,824      cycles                                                        ( +-  1.35% )  (82.30%)

           0.07108 +- 0.00242 seconds time elapsed  ( +-  3.41% )

 Performance counter stats for './qtest -f trace-lsort.cmd' (10 runs):

        28,749,713      branches                                                      ( +-  1.24% )  (80.54%)
           902,886      branch-misses             #    3.14% of all branches          ( +-  1.50% )  (80.61%)
         4,966,511      cache-references                                              ( +-  2.34% )  (82.85%)
         1,326,930      cache-misses              #   26.718 % of all cache refs      ( +-  0.76% )  (86.24%)
       142,718,022      instructions              #    0.70  insn per cycle           ( +-  0.83% )  (86.42%)
       204,829,798      cycles                                                        ( +-  1.05% )  (83.35%)

           0.06218 +- 0.00133 seconds time elapsed  ( +-  2.14% )

100000

Performance counter stats for './qtest -f trace-sort.cmd' (10 runs):

        58,451,551      branches                                                      ( +-  1.24% )  (82.64%)
         1,023,503      branch-misses             #    1.75% of all branches          ( +-  1.95% )  (83.34%)
        13,488,726      cache-references                                              ( +-  0.97% )  (83.35%)
         2,710,467      cache-misses              #   20.094 % of all cache refs      ( +-  0.95% )  (83.57%)
       287,066,488      instructions              #    0.53  insn per cycle           ( +-  1.35% )  (83.94%)
       537,631,546      cycles                                                        ( +-  0.74% )  (83.16%)

           0.15654 +- 0.00248 seconds time elapsed  ( +-  1.59% )

 Performance counter stats for './qtest -f trace-lsort.cmd' (10 runs):

        59,243,074      branches                                                      ( +-  0.55% )  (82.27%)
         1,698,227      branch-misses             #    2.87% of all branches          ( +-  0.96% )  (82.46%)
        11,503,090      cache-references                                              ( +-  0.94% )  (82.73%)
         2,972,868      cache-misses              #   25.844 % of all cache refs      ( +-  0.52% )  (83.95%)
       274,866,960      instructions              #    0.57  insn per cycle           ( +-  0.51% )  (84.76%)
       485,683,347      cycles                                                        ( +-  0.65% )  (83.83%)

           0.14038 +- 0.00246 seconds time elapsed  ( +-  1.75% )

500000

 Performance counter stats for './qtest -f trace-sort.cmd' (10 runs):

       303,464,710      branches                                                      ( +-  0.20% )  (83.14%)
         5,738,668      branch-misses             #    1.89% of all branches          ( +-  0.44% )  (83.23%)
        79,618,128      cache-references                                              ( +-  0.88% )  (83.36%)
        17,593,359      cache-misses              #   22.097 % of all cache refs      ( +-  0.39% )  (83.45%)
     1,470,930,563      instructions              #    0.42  insn per cycle           ( +-  0.24% )  (83.49%)
     3,474,129,943      cycles                                                        ( +-  0.33% )  (83.32%)

           0.95146 +- 0.00266 seconds time elapsed  ( +-  0.28% )

 Performance counter stats for './qtest -f trace-lsort.cmd' (10 runs):

       310,717,746      branches                                                      ( +-  0.40% )  (83.09%)
         9,569,091      branch-misses             #    3.08% of all branches          ( +-  0.25% )  (83.30%)
        64,293,293      cache-references                                              ( +-  0.25% )  (83.35%)
        18,789,900      cache-misses              #   29.225 % of all cache refs      ( +-  0.14% )  (83.41%)
     1,416,507,591      instructions              #    0.48  insn per cycle           ( +-  0.37% )  (83.49%)
     2,951,098,672      cycles                                                        ( +-  0.26% )  (83.36%)

           0.80188 +- 0.00252 seconds time elapsed  ( +-  0.31% )

500000 same

 Performance counter stats for './qtest -f trace-sort.cmd' (10 runs):

       159,219,808      branches                                                      ( +-  0.88% )  (82.70%)
           385,539      branch-misses             #    0.24% of all branches          ( +-  1.32% )  (83.21%)
        50,093,662      cache-references                                              ( +-  0.97% )  (83.36%)
         1,931,028      cache-misses              #    3.855 % of all cache refs      ( +-  1.09% )  (83.49%)
       751,876,531      instructions              #    1.25  insn per cycle           ( +-  0.43% )  (84.04%)
       601,405,011      cycles                                                        ( +-  0.82% )  (83.21%)

           0.17093 +- 0.00219 seconds time elapsed  ( +-  1.28% )

 Performance counter stats for './qtest -f trace-lsort.cmd' (10 runs):

       194,017,598      branches                                                      ( +-  0.40% )  (82.56%)
           890,984      branch-misses             #    0.46% of all branches          ( +-  1.31% )  (82.70%)
        41,181,898      cache-references                                              ( +-  0.46% )  (83.37%)
         1,450,604      cache-misses              #    3.522 % of all cache refs      ( +-  0.85% )  (83.66%)
       903,242,190      instructions              #    1.37  insn per cycle           ( +-  1.14% )  (84.31%)
       658,638,789      cycles                                                        ( +-  0.49% )  (83.40%)

           0.18613 +- 0.00240 seconds time elapsed  ( +-  1.29% )

sort＼資料數	10000	50000	100000	500000	500000(same)
我的	0.010288s	0.07108s	0.15654s	0.95146s	0.17093s
Linux的	0.008412s	0.06218s	0.14038s	0.80188s	0.18613s
比值	1.22301	1.14313	1.11512	1.18654	0.91834

可以發現，除了測資都相同的情形，其餘的 Linux sort 的效能皆是我的 1.1~1.2 倍

我的版本對已排序的測資有特別判斷以及簡化處理

仔細觀察，發現我的版本都擁有比較少的 branch 數以及 branch-miss 率

我認為是歸功於 pointer to pointer 的使用，每輪減少了一次的 branch

也有較多的 cache-reference 以及較少的 cache-miss 率
在 cache 的使用上是比較優異的
但時間上依舊輸給 Linux 的版本
發現 instruction 數跟 cycle 數皆比較高，也就是執行了較多的指令，因而花費更多時間

程式流程太複雜，可以考慮合併途中不進行 circular doubly-linked list 的維護

TODO: 測試比較次數，畢竟 merge-sort 的效率很大部份取決於進行比較的次數（演算法設計）

開發過程 - qtest.c

提供新的命令 shuffle

參考 Fisher-Yates Shuffle 撰寫一個

O (n^{2})

的 shuffle 實作

曾考慮先把每個節點用 table 紀錄，用查找的方式降低複雜度
不過更新 table 會比較麻煩，就放棄了

static bool do_shuffle(int argc, char *argv[])
{
    if (argc != 1) {
        report(1, "%s takes no arguments", argv[0]);
        return false;
    }

    if (!l_meta.l)
        report(3, "Warning: Try to access null queue");
    error_check();

    /* Fisher-Yates Shuffle */
    srand(time(NULL));
    size_t size = l_meta.size;
    // size >= 0
    for (; size; --size) {
        /*
         * random range: [0, size-1]
         * => rand() % ((size-1) - 0 + 1) + 0
         */
        size_t r = (size_t) rand() % size;
        struct list_head *p = l_meta.l->next;
        for (; r; --r, p = p->next)
            ; // clang-format style ><
        // remove and add to tail
        list_move_tail(p, l_meta.l);
    }

    show_queue(3);
    return !error_check();
}

在 console_init 中加入

ADD_COMMAND(shuffle, "                | Shuffle the queue");

成為新的命令