# Link List Sort - 2

contributed by < `jhan1998` >

## Linked-List merge sort

### Principle

```c
void list_merge_sort(queue_t *q)
{
    if (list_is_singular(&q->list))
        return;

    queue_t left;
    struct list_head sorted;
    INIT_LIST_HEAD(&left.list);
    list_cut_position(&left.list, &q->list, &get_middle(&q->list)->list);
    list_merge_sort(&left);
    list_merge_sort(q);
    list_merge(&left.list, &q->list, &sorted);
    INIT_LIST_HEAD(&q->list);
    list_splice_tail(&sorted, &q->list);
}
```

The base case of the recursion can be seen at the beginning of `list_merge_sort()`:

```c
if (list_is_singular(&q->list))
    return;
```

```c
static inline int list_is_singular(const struct list_head *head)
{
    return (!list_empty(head) && head->prev == head->next);
}
```

`list_is_singular(&q->list)` returns true when the list contains exactly one element: it is non-empty and the head's `prev` and `next` both point at the same node.

Next comes the partition step, built on `list_cut_position()`. From the original code:

```c
static inline void list_cut_position(struct list_head *head_to,
                                     struct list_head *head_from,
                                     struct list_head *node)
{
    struct list_head *head_from_first = head_from->next;

    if (list_empty(head_from))
        return;

    if (head_from == node) {
        INIT_LIST_HEAD(head_to);
        return;
    }

    head_from->next = node->next;
    head_from->next->prev = head_from;

    head_to->prev = node;
    node->next = head_to;
    head_to->next = head_from_first;
    head_to->next->prev = head_to;
}
```

`list_cut_position()` cuts the list headed by `head_from` at the given `node` and moves the front portion, up to and including `node`, onto the list headed by `head_to`.

1. 
```graphviz
digraph {
    rankdir = LR;
    node [shape = box];
    t1 [label = "head_to"];
    t1 -> t1:n;
    t1 -> t1:n;
    e1 [label = "head_from"];
    e2 [label = "a (head_from_first)"];
    e3 [label = "b"];
    e4 [label = "node"];
    e5 [label = "c"];
    e6 [label = "d"];
    e1 -> e2 -> e3 -> e4 -> e5 -> e6 -> e1:n;
    e2 -> e1;
    e3 -> e2;
    e4 -> e3;
    e5 -> e4;
    e6 -> e5;
    e1 -> e6;
}
```
2. 
```graphviz
digraph {
    rankdir = LR;
    node [shape = box];
    t1 [label = "head_to"];
    t1 -> t1;
    t1 -> t1;
    e1 [label = "head_from"];
    e2 [label = "a (head_from_first)"];
    e3 [label = "b"];
    e4 [label = "node"];
    e5 [label = "c (node->next)"];
    e6 [label = "d"];
    e2 -> e3 -> e4:n;
    e1 -> e5 -> e6:n;
    e2 -> e1;
    e3 -> e2;
    e4 -> e3;
    e5 -> e1;
    e6 -> e5;
    e1 -> e6;
}
```
3. 
```graphviz
digraph {
    rankdir = LR;
    node [shape = box];
    t1 [label = "head_to"];
    e1 [label = "head_from"];
    e2 [label = "a (head_from_first)"];
    e3 [label = "b"];
    e4 [label = "node"];
    e5 [label = "c"];
    e6 [label = "d"];
    e1 -> e5 -> e6 -> e1:n;
    t1 -> e2 -> e3 -> e4 -> t1:n;
    e2 -> t1;
    e3 -> e2;
    e4 -> e3;
    t1 -> e4;
    e5 -> e1;
    e6 -> e5;
    e1 -> e6;
}
```

After the cut, the recursion keeps dividing both halves until each sublist can no longer be divided, and then the merge phase, `list_merge()`, takes over:

```c
static void list_merge(struct list_head *lhs,
                       struct list_head *rhs,
                       struct list_head *head)
{
    INIT_LIST_HEAD(head);
    if (list_empty(lhs)) {
        list_splice_tail(lhs, head);
        return;
    }
    if (list_empty(rhs)) {
        list_splice_tail(rhs, head);
        return;
    }

    while (!list_empty(lhs) && !list_empty(rhs)) {
        char *lv = list_entry(lhs->next, list_ele_t, list)->value;
        char *rv = list_entry(rhs->next, list_ele_t, list)->value;
        struct list_head *tmp = strcmp(lv, rv) <= 0 ? lhs->next : rhs->next;
        list_del(tmp);
        list_add_tail(tmp, head);
    }
    list_splice_tail(list_empty(lhs) ? rhs : lhs, head);
}
```

`list_splice_tail()` appends one whole list to the tail of another.
:::warning
The two leading checks in `list_merge()` should test *different* lists: if `lhs` is empty, the remaining `rhs` should be spliced after `head`, and vice versa. As written, the code splices the empty list instead, so it should be changed to:
```c
if (list_empty(lhs)) {
    list_splice_tail(rhs, head);
    return;
}
if (list_empty(rhs)) {
    list_splice_tail(lhs, head);
    return;
}
```
:::

### improvement

```c
static void list_merge(struct list_head *lhs,
                       struct list_head *rhs,
                       struct list_head *head)
{
    INIT_LIST_HEAD(head);
    if (list_empty(lhs)) {
        list_splice_tail(lhs, head);
        return;
    }
    if (list_empty(rhs)) {
        list_splice_tail(rhs, head);
        return;
    }

    while (!list_empty(lhs) && !list_empty(rhs)) {
        char *lv = list_entry(lhs->next, list_ele_t, list)->value;
        char *rv = list_entry(rhs->next, list_ele_t, list)->value;
        struct list_head *tmp = strcmp(lv, rv) <= 0 ? lhs->next : rhs->next;
        list_del(tmp);
        list_add_tail(tmp, head);
    }
    list_splice_tail(list_empty(lhs) ? rhs : lhs, head);
}
```

In `list_merge()`, the `while` loop condition already checks whether either linked list is empty, so the two leading `if` blocks can simply be deleted.

According to the modification experience of [linux-list](https://github.com/sysprog21/linux-list), I think the `queue_t` structure is superfluous: it is enough to create a `struct list_head head` to link the nodes together and then use `list_entry()` to reach the value of each `list_ele_t`.
The corresponding modification is as follows:

**insert_head**
```c
bool insert_head(char *s, struct list_head *head)
{
    list_ele_t *newh = malloc(sizeof(list_ele_t));
    if (!newh)
        return false;
    char *new_value = strdup(s);
    if (!new_value) {
        free(newh);
        return false;
    }
    newh->value = new_value;
    list_add_tail(&newh->list, head);
    return true;
}
```

**list_merge_sort**
```c
void list_merge_sort(struct list_head *head)
{
    if (list_empty(head) || list_is_singular(head))
        return;

    struct list_head left;
    struct list_head sorted;
    INIT_LIST_HEAD(&left);
    list_cut_position(&left, head, get_middle(head));
    list_merge_sort(&left);
    list_merge_sort(head);
    list_merge(&left, head, &sorted);
    INIT_LIST_HEAD(head);
    list_splice_tail(&sorted, head);
}
```

**list_merge**
```c
static void list_merge(struct list_head *lhs,
                       struct list_head *rhs,
                       struct list_head *head)
{
    INIT_LIST_HEAD(head);
    while (!list_empty(lhs) && !list_empty(rhs)) {
        char *lv = list_entry(lhs->next, list_ele_t, list)->value;
        char *rv = list_entry(rhs->next, list_ele_t, list)->value;
        struct list_head *tmp = strcmp(lv, rv) <= 0 ? lhs->next : rhs->next;
        list_del(tmp);
        list_add_tail(tmp, head);
    }
    list_splice_tail(list_empty(lhs) ? rhs : lhs, head);
}
```

**get_middle**
```c
static struct list_head *get_middle(struct list_head *list)
{
    struct list_head *fast = list->next, *slow;
    list_for_each (slow, list) {
        if (fast->next == list || fast->next->next == list)
            break;
        fast = fast->next->next;
    }
    return slow;
}
```

**validate**
```c
static bool validate(struct list_head *head, FILE *fp)
{
    struct list_head *node;
    list_for_each (node, head) {
        fprintf(fp, "%s", list_entry(node, list_ele_t, list)->value);
        if (node->next == head)
            break;
        if (strcmp(list_entry(node, list_ele_t, list)->value,
                   list_entry(node->next, list_ele_t, list)->value) > 0)
            return false;
    }
    return true;
}
```

**list_free**
```c
void list_free(struct list_head *head)
{
    for (struct list_head *node = head->next; node != head;) {
        struct list_head *tmp = node;
        node = node->next;
        list_del(tmp);
        list_ele_t *entry = list_entry(tmp, list_ele_t, list);
        free(entry->value);
        free(entry);
    }
}
```

In short, `struct list_head` replaces `queue_t` as the linking structure. Note that `list_free()` cannot simply use `list_for_each` to find each `list_entry` and free it: once a `list_ele_t` is freed, the `next` pointer stored in it is gone as well, and valgrind reports an "Invalid read of size 8" error when the iterator dereferences it. With this improvement, the allocated space is reduced.
The following is a comparison:

* Before change
```
==29668== HEAP SUMMARY:
==29668==     in use at exit: 0 bytes in 0 blocks
==29668==   total heap usage: 187,657 allocs, 187,657 frees, 5,280,172 bytes allocated
```
* After change
```
==29631== HEAP SUMMARY:
==29631==     in use at exit: 0 bytes in 0 blocks
==29631==   total heap usage: 187,658 allocs, 187,658 frees, 4,534,164 bytes allocated
```

### How lib/list_sort.c is optimized in the Linux kernel

Referring to [sysprog21/linux-list](https://github.com/sysprog21/linux-list), [lib/list_sort.c](https://github.com/torvalds/linux/blob/master/lib/list_sort.c) is extracted into a standalone user-level application, divided into three parts: `list.h`, `list_sort.h`, and `list_sort.c`. First, `list.h`: as in linux-list, this header defines the main structure and the APIs used, such as `list_add_tail()` and `container_of`; it only needs to extract part of the content from [linux/list.h](https://github.com/torvalds/linux/blob/master/include/linux/list.h).
**list.h**
```c
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <time.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

#define true 1
#define false 0

/**
 * container_of() - Calculate address of object that contains address ptr
 * @ptr: pointer to member variable
 * @type: type of the structure containing ptr
 * @member: name of the member variable in struct @type
 *
 * Return: @type pointer of object containing ptr
 */
#ifndef container_of
#define container_of(ptr, type, member)                            \
    __extension__({                                                \
        const __typeof__(((type *) 0)->member) *__pmember = (ptr); \
        (type *) ((char *) __pmember - offsetof(type, member));    \
    })
#endif

/**
 * struct list_head - Head and node of a doubly-linked list
 * @prev: pointer to the previous node in the list
 * @next: pointer to the next node in the list
 */
struct list_head {
    struct list_head *prev, *next;
};

struct listitem {
    uint16_t i;
    struct list_head list;
};

/**
 * LIST_HEAD - Declare list head and initialize it
 * @head: name of the new object
 */
#define LIST_HEAD(head) struct list_head head = {&(head), &(head)}

/**
 * INIT_LIST_HEAD() - Initialize empty list head
 * @head: pointer to list head
 */
static inline void INIT_LIST_HEAD(struct list_head *head)
{
    head->next = head;
    head->prev = head;
}

/**
 * list_add_tail() - Add a list node to the end of the list
 * @node: pointer to the new node
 * @head: pointer to the head of the list
 */
static inline void list_add_tail(struct list_head *node, struct list_head *head)
{
    struct list_head *prev = head->prev;

    prev->next = node;
    node->next = head;
    node->prev = prev;
    head->prev = node;
}

/**
 * list_del() - Remove a list node from the list
 * @node: pointer to the node
 */
static inline void list_del(struct list_head *node)
{
    struct list_head *next = node->next, *prev = node->prev;

    next->prev = prev;
    prev->next = next;
}

/**
 * list_empty() - Check if list head has no nodes attached
 * @head: pointer to the head of the list
 */
static inline int list_empty(const struct list_head *head)
{
    return (head->next == head);
}

/**
 * list_is_singular() - Check if list head has exactly one node attached
 * @head: pointer to the head of the list
 */
static inline int list_is_singular(const struct list_head *head)
{
    return (!list_empty(head) && head->prev == head->next);
}

/**
 * list_splice_tail() - Add list nodes from a list to end of another list
 * @list: pointer to the head of the list with the node entries
 * @head: pointer to the head of the list
 */
static inline void list_splice_tail(struct list_head *list,
                                    struct list_head *head)
{
    struct list_head *head_last = head->prev;
    struct list_head *list_first = list->next, *list_last = list->prev;

    if (list_empty(list))
        return;

    head->prev = list_last;
    list_last->next = head;

    list_first->prev = head_last;
    head_last->next = list_first;
}

/**
 * list_cut_position() - Move beginning of a list to another list
 * @head_to: pointer to the head of the list which receives nodes
 * @head_from: pointer to the head of the list
 * @node: pointer to the node in which defines the cutting point
 */
static inline void list_cut_position(struct list_head *head_to,
                                     struct list_head *head_from,
                                     struct list_head *node)
{
    struct list_head *head_from_first = head_from->next;

    if (list_empty(head_from))
        return;

    if (head_from == node) {
        INIT_LIST_HEAD(head_to);
        return;
    }

    head_from->next = node->next;
    head_from->next->prev = head_from;

    head_to->prev = node;
    node->next = head_to;
    head_to->next = head_from_first;
    head_to->next->prev = head_to;
}

/**
 * list_entry() - Calculate address of entry that contains list node
 * @node: pointer to list node
 * @type: type of the entry containing the list node
 * @member: name of the list_head member variable in struct @type
 */
#define list_entry(node, type, member) container_of(node, type, member)

/**
 * list_first_entry() - get first entry of the list
 * @head: pointer to the head of the list
 * @type: type of the entry containing the list node
 * @member: name of the list_head member variable in struct @type
 */
#define list_first_entry(head, type, member) \
    list_entry((head)->next, type, member)

/**
 * list_for_each - iterate over list nodes
 * @node: list_head pointer used as iterator
 * @head: pointer to the head of the list
 */
#define list_for_each(node, head) \
    for (node = (head)->next; node != (head); node = node->next)

/**
 * list_for_each_entry_safe - iterate over list entries and allow deletes
 * @entry: pointer used as iterator
 * @safe: @type pointer used to store info for next entry in list
 * @head: pointer to the head of the list
 * @member: name of the list_head member variable in struct type of @entry
 *
 * The current node (iterator) is allowed to be removed from the list. Any
 * other modifications to the list will cause undefined behavior.
 *
 * FIXME: remove dependency of __typeof__ extension
 */
#define list_for_each_entry_safe(entry, safe, head, member)                \
    for (entry = list_entry((head)->next, __typeof__(*entry), member),     \
        safe = list_entry(entry->member.next, __typeof__(*entry), member); \
         &entry->member != (head); entry = safe,                           \
        safe = list_entry(safe->member.next, __typeof__(*entry), member))

static inline uint8_t getnum(void)
{
    static uint16_t s1 = UINT16_C(2);
    static uint16_t s2 = UINT16_C(1);
    static uint16_t s3 = UINT16_C(1);

    s1 *= UINT16_C(171);
    s1 %= UINT16_C(30269);
    s2 *= UINT16_C(172);
    s2 %= UINT16_C(30307);
    s3 *= UINT16_C(170);
    s3 %= UINT16_C(30323);

    return s1 ^ s2 ^ s3;
}

static uint16_t get_unsigned16(void)
{
    uint16_t x = 0;
    size_t i;

    for (i = 0; i < sizeof(x); i++) {
        x <<= 8;
        x |= getnum();
    }
    return x;
}

static inline int cmpint(const void *p1, const void *p2)
{
    const uint16_t *i1 = (const uint16_t *) p1;
    const uint16_t *i2 = (const uint16_t *) p2;

    return *i1 - *i2;
}

static inline int cmplist(void *arrange,
                          struct list_head *a,
                          struct list_head *b)
{
    const uint16_t ai = container_of(a, struct listitem, list)->i;
    const uint16_t bi = container_of(b, struct listitem, list)->i;

    return ai - bi;
}

static inline void random_shuffle_array(uint16_t *operations, uint16_t len)
{
    uint16_t i;
    uint16_t j;

    for (i = 0; i < len; i++) {
        /* WARNING biased shuffling */
        j = get_unsigned16() % (i + 1);
        operations[i] = operations[j];
        operations[j] = i;
    }
}

#endif
```

`list_sort.h` basically declares the sorting functions that will be used in `list_sort.c`.

**list_sort.h**
```c
#ifndef _LINUX_LIST_SORT_H
#define _LINUX_LIST_SORT_H

#include "list.h"

typedef int __attribute__((nonnull(2, 3))) (*cmp_func)(
    void *, struct list_head const *, struct list_head const *);

__attribute__((nonnull(2, 3, 4))) static struct list_head *
merge(void *priv, cmp_func cmp, struct list_head *a, struct list_head *b);

__attribute__((nonnull(2, 3, 4, 5))) static void
merge_final(void *priv, cmp_func cmp, struct list_head *head,
            struct list_head *a, struct list_head *b);

__attribute__((nonnull(2, 3))) void list_sort(void *priv,
                                              struct list_head *head,
                                              int (*cmp)(void *priv,
                                                         struct list_head *a,
                                                         struct list_head *b));

#endif
```

The main porting work is in `list_sort.c`, which contains the implementation of `list_sort()` and the correctness check. The following is the main function.
**main**
```c
int main()
{
    struct list_head testlist;
    struct listitem *item, *is = NULL;
    size_t i;
    FILE *fp = fopen("test.txt", "w");

    random_shuffle_array(values, (uint16_t) ARRAY_SIZE(values));

    INIT_LIST_HEAD(&testlist);
    assert(list_empty(&testlist));

    for (i = 0; i < ARRAY_SIZE(values); i++) {
        item = (struct listitem *) malloc(sizeof(*item));
        assert(item);
        item->i = values[i];
        list_add_tail(&item->list, &testlist);
    }

    assert(!list_empty(&testlist));

    list_for_each_entry_safe (item, is, &testlist, list) {
        fprintf(fp, "%d ", item->i);
    }
    fprintf(fp, "\n");

    qsort(values, ARRAY_SIZE(values), sizeof(values[0]), cmpint);
    list_sort(NULL, &testlist, cmplist);

    i = 0;
    list_for_each_entry_safe (item, is, &testlist, list) {
        fprintf(fp, "%d ", item->i);
        assert(item->i == values[i]);
        list_del(&item->list);
        free(item);
        i++;
    }
    fprintf(fp, "\n");
    fclose(fp);

    assert(i == ARRAY_SIZE(values));
    assert(list_empty(&testlist));
    return 0;
}
```

#### Analysis

Let us first look at the comment on the `list_sort` function in the Linux kernel:

> This mergesort is as eager as possible while always performing at least 2:1 balanced merges. Given two pending sublists of size 2^k, they are merged to a size-2^(k+1) list as soon as we have 2^k following elements. Thus, it will avoid cache thrashing as long as 3 * 2^k elements can fit into the cache. Not quite as good as a fully-eager bottom-up mergesort, but it does use 0.2*n fewer comparisons, so is faster in the common case that everything fits into L1.

`list_sort()` is optimized mainly for the memory hierarchy. As the comment explains, merges are always performed at a ratio of at least 2:1, which avoids cache thrashing as long as the cache can hold $3 \times 2^k$ elements. While splitting, `list_sort()` moves the elements to be processed onto another list called `pending`.
Since the goal is to merge at 2:1, two $2^k$ sublists are merged only once another $2^k$ elements have followed. After all the elements have been consumed, the remaining pending sublists have sizes

$$ 2^k, 2^{k-1}, \dots, 2^2, 2^1 $$

With sizes in this shape, the whole list can then be merged back together at the 2:1 ratio mentioned in the comment.

**e.g.**

| count | pending | pending | pending | pending | follow |
| ----- | ------- | ------- | ------- | ------- | ------ |
| 0 | 1 | | | | 1 |
| 1 | 1 | 1 | | | 1 |
| 2 (merge) | 2 | 1 | | | 1 |
| 3 | 2 | 1 | 1 | | 1 |
| 4 (merge) | 2 | 2 | 1 | | 1 |
| 5 (merge) | 4 | 1 | 1 | | 1 |
| 6 (merge) | 4 | 2 | 1 | | 1 |
| 7 | 4 | 2 | 1 | 1 | 1 |
| 8 (merge) | 4 | 2 | 2 | 1 | 1 |
| 9 (merge) | 4 | 4 | 1 | 1 | 1 |
| 10 (merge) | 4 | 4 | 2 | 1 | 1 |
| 11 (merge) | 8 | 2 | 1 | 1 | 1 |
| 12 (merge) | 8 | 2 | 2 | 1 | 1 |
| 13 (merge) | 8 | 4 | 1 | 1 | 1 |
| 14 (merge) | 8 | 4 | 2 | 1 | done |

With 15 elements the counts evolve as above: the pending sublists end up with sizes $2^3, 2^2, 2^1, 2^0$, and the final pass merges them from small to large. This scheme avoids unbalanced merges. Although it is not quite as good as a fully-eager bottom-up mergesort, it uses about $0.2n$ fewer comparisons, so it is faster in the common case where everything fits into L1.

### Test and performance comparison

Compared to the quick sort implemented in quiz1:

![](https://i.imgur.com/hYyXyx2.png)

Let's compare the memory allocated on the heap and the stack during execution.
#### heap

**list_sort**
```
==30058== HEAP SUMMARY:
==30058==     in use at exit: 0 bytes in 0 blocks
==30058==   total heap usage: 1,025,002 allocs, 1,025,002 frees, 26,628,648 bytes allocated
```

**quicksort**
```
==30107== HEAP SUMMARY:
==30107==     in use at exit: 0 bytes in 0 blocks
==30107==   total heap usage: 1,025,002 allocs, 1,025,002 frees, 26,628,648 bytes allocated
```

The heap usage of the two is identical. Heap space is allocated mainly by `malloc`, so with the same structure and the same number of list nodes the totals barely differ. The interesting part is therefore the stack behavior during execution.

#### stack

Here we use `valgrind --tool=massif --stacks=yes`:

**list_sort**
```
$ ms_print massif.out.30375
--------------------------------------------------------------------------------
Command:            ./list_sort
Massif arguments:   --stacks=yes
ms_print arguments: massif.out.30375
--------------------------------------------------------------------------------

(massif graph omitted: peak total usage ~43.77 KB over ~1.274 Gi instructions)

Number of snapshots: 97
 Detailed snapshots: [1 (peak), 10, 20, 23, 51, 61, 71, 81, 91]

--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
  0              0                0                0             0            0
  1        414,231           44,824           26,624        16,392        1,808
59.40% (26,624B) (heap allocation functions) malloc/new/new[], --alloc-fns, etc.
->54.83% (24,576B) 0x108F0A: main (in /home/jhan1998/linux2021/quiz2/list_sort)
->04.57% (2,048B) 0x4E776DE: qsort_r (msort.c:222)
  ->04.57% (2,048B) 0x108FC0: main (in /home/jhan1998/linux2021/quiz2/list_sort)
--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
  2     21,329,084           44,048           26,624        16,392        1,032
  3     45,318,463           25,520           15,096        10,064          360
  4     60,925,639           43,784           26,624        16,392          768
  5     82,870,639           44,808           26,624        16,392        1,792
  6    100,113,101            2,512            1,248           832          432
  7    111,869,386           43,664           26,624        16,392          648
  8    126,238,190              440                0             0          440
  9    149,750,638           44,432           26,624        16,392        1,416
 10    170,128,108            8,568            4,920         3,280          368
```
```
--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
 92  1,321,466,004              408                0             0          408
 93  1,332,959,706           44,304           26,624        16,392        1,288
 94  1,344,453,392           41,536           24,576        16,384          576
 95  1,355,947,105           44,560           26,624        16,392        1,544
 96  1,367,440,823           41,528           24,576        16,384          568
```

**quicksort**
```
$ ms_print massif.out.30463
--------------------------------------------------------------------------------
Command:            ./qsort
Massif arguments:   --stacks=yes
ms_print arguments: massif.out.30463
--------------------------------------------------------------------------------

(massif graph omitted: peak total usage ~44.30 KB over ~2.076 Gi instructions)

Number of snapshots: 56
 Detailed snapshots: [1, 7 (peak), 10, 16, 20, 29, 33, 38, 42, 51]

--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
  0              0                0                0             0            0
  1     27,812,220           42,256           24,576        16,384        1,296
58.16% (24,576B) (heap allocation functions) malloc/new/new[], --alloc-fns, etc.
->58.16% (24,576B) 0x1092F5: main (in /home/jhan1998/linux2021/quiz2/qsort)
->00.00% (0B) in 1+ places, all below ms_print's threshold (01.00%)
--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
  2     73,150,427           41,936           24,576        16,384          976
  3    138,499,140           44,064           24,576        16,384        3,104
  4    179,888,135           23,672           13,944         9,296          432
  5    216,949,699           42,080           24,576        16,384        1,120
  6    248,452,079           41,936           24,576        16,384          976
  7    300,748,565           45,360           24,576        16,384        4,400
```
```
--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
 52  2,146,167,569              440                0             0          440
 53  2,173,963,866           41,616           24,576        16,384          656
 54  2,201,760,176           43,264           24,576        16,384        2,304
 55  2,229,556,490           43,776           24,576        16,384        2,816
```

On average, the stack cost of quicksort is relatively high, mostly above 1,000 bytes. list_sort occasionally spikes above 1,000 bytes as well, but on average it stays around 400-500 bytes.
---

## Bit field operation

### Principles

```c
uint16_t func(uint16_t N)
{
    /* change all right side bits to 1 */
    N |= N >> 1;
    N |= N >> 2;
    N |= N >> 4;
    N |= N >> 8;
    return (N + 1) >> 1;
}
```

The key to this function is understanding `return (N + 1) >> 1`. For it to yield a single power of two, `N` at that point must have the form $2^n - 1$:

```
#N               1111
#N+1            10000
#(N + 1) >> 1    1000
```

To set all bits below the most significant 1, observe the worst case for a 16-bit unsigned integer:

```
#N            1000000000000000
#N |= N >> 1  1100000000000000
#N |= N >> 2  1111000000000000
#N |= N >> 4  1111111100000000
#N |= N >> 8  1111111111111111
```

Each step shifts by $2^n$ bits starting from n = 0, doubling the run of set bits, so four steps suffice to set all 16 bits. The function therefore returns the largest power of two not exceeding `N`.

### is_power_of_2

In `linux/include/linux/log2.h`, `is_power_of_2` is defined like this:

```c
/*
 * Determine whether some value is a power of two, where zero is
 * *not* considered a power of two.
 */
static inline __attribute__((const)) bool is_power_of_2(unsigned long n)
{
    return (n != 0 && ((n & (n - 1)) == 0));
}
```

Besides checking for non-zero, it checks whether `n & (n - 1)` is 0. For example:

$16_{10} = 10000_2$
$16_{10} - 1 = 01111_2$
$16_{10}\ \&\ (16_{10} - 1) = 00000_2$

A power of two has exactly one set bit, and subtracting 1 clears that bit while setting all lower bits, so the AND is zero only for powers of two.

### round up/down power of 2

**__roundup_pow_of_two**
```c
/*
 * round up to nearest power of two
 */
static inline __attribute__((const)) unsigned long
__roundup_pow_of_two(unsigned long n)
{
    return 1UL << fls_long(n - 1);
}
```

**__rounddown_pow_of_two**
```c
/*
 * round down to nearest power of two
 */
static inline __attribute__((const)) unsigned long
__rounddown_pow_of_two(unsigned long n)
{
    return 1UL << (fls_long(n) - 1);
}
```

**fls_long**
```c
static inline unsigned fls_long(unsigned long l)
{
    if (sizeof(l) == 4)
        return fls(l);
    return fls64(l);
}
```

`fls()` stands for *find last set*: it returns the position of the most significant set bit, counting from 1 at the LSB. e.g. 
`fls(0x80000000) = 32`, `fls(0x00000001) = 1`.

Therefore `__rounddown_pow_of_two` uses `1UL << (fls_long(n) - 1)` to keep only the leftmost 1 bit of `n`, while `__roundup_pow_of_two` uses `1UL << fls_long(n - 1)` to go one position past the leftmost 1 of `n - 1`. If `n` is already a power of two, both return the same value, `n`.

---

## bitcpy() implementation

### Principle

Here I analyze the code of `bitcpy()` step by step.

```c
size_t read_lhs = _read & 7;
size_t read_rhs = 8 - read_lhs;
const uint8_t *source = (const uint8_t *) _src + (_read / 8);
size_t write_lhs = _write & 7;
size_t write_rhs = 8 - write_lhs;
uint8_t *dest = (uint8_t *) _dest + (_write / 8);
```

This prologue prepares for the reads that follow: it computes the byte positions and the bit offsets within those bytes, handling offset and alignment for both source and destination.

```c
uint8_t data = *source++;
size_t bitsize = (count > 8) ? 8 : count;
if (read_lhs > 0) {
    data <<= read_lhs;
    if (bitsize > read_rhs)
        data |= (*source >> read_rhs);
}
```

This reads from the specified source location. From `size_t bitsize = (count > 8) ? 8 : count;` we can see that at most one byte of data is processed per iteration. The check `read_lhs > 0` asks whether the read position is aligned to a byte boundary. If not, the byte fetched in advance is shifted left by `read_lhs` bits to align it with the true start of `_read`, and when the read crosses into the next byte, that byte is shifted right by `read_rhs` bits and ORed in, which guarantees that `data` holds 8 consecutive bits. The next step determines how many bits should actually be kept and masks off the excess.
```c
if (bitsize < 8)
    data &= read_mask[bitsize];
```

Write part:

```c
uint8_t original = *dest;
uint8_t mask = read_mask[write_lhs];
if (bitsize > write_rhs) {
    /* Cross multiple bytes */
    *dest++ = (original & mask) | (data >> write_lhs);
    original = *dest & write_mask[bitsize - write_rhs];
    *dest = original | (data << write_rhs);
} else {
    // Since write_lhs + bitsize is never >= 8, no out-of-bound access.
    mask |= write_mask[write_lhs + bitsize];
    *dest++ = (original & mask) | (data >> write_lhs);
}
```

First a mask is set up to preserve the destination bits that must survive. If `bitsize > write_rhs`, the data has to be written in two stages: `original & mask` clears the bits about to be overwritten, and the upper bits of `data` land in the right-hand `8 - write_lhs` bits of the first byte. Then `original = *dest & write_mask[bitsize - write_rhs];` likewise clears the target bits of the second byte, and the remaining bits of `data`, shifted left, are stored there. In the `else` branch everything fits in one byte: `mask |= write_mask[write_lhs + bitsize];` extends the mask so that only the window being written is cleared, and `data`, shifted right to the write offset, is stored in a single step.

---

## Cstring operation

### Principle

Start from the test function:

```c
static void test_cstr()
{
    CSTR_BUFFER(a);
    cstr_cat(a, "Hello ");
    cstr_cat(a, "string");
    cstring b = cmp(CSTR_S(a));
    printf("%s\n", b->cstr);
    CSTR_CLOSE(a);
    cstr_release(b);
}
```

`CSTR_BUFFER()` first creates an array of `CSTR_STACK_SIZE` bytes to hold the string, then initializes a `__cstr_data` object whose `cstr` member points at that buffer, and finally creates a `cstr_buffer` whose `cstring str` member points at the `__cstr_data` above. Because `cstring` is `struct __cstr_data *` while the object was declared and initialized as a plain `struct __cstr_data`, the last step is `var->str = &var##_cstr_data;`.
```c
#define CSTR_BUFFER(var)                                                  \
    char var##_cstring[CSTR_STACK_SIZE] = {0};                            \
    struct __cstr_data var##_cstr_data = {var##_cstring, 0, CSTR_ONSTACK, \
                                          0};                             \
    cstr_buffer var;                                                      \
    var->str = &var##_cstr_data;
```

Next comes `cstr_cat()`:

```c
cstring cstr_cat(cstr_buffer sb, const char *str)
{
    cstring s = sb->str;
    if (s->type == CSTR_ONSTACK) {
        int i = s->hash_size;
        while (i < CSTR_STACK_SIZE - 1) {
            s->cstr[i] = *str;
            if (*str == 0)
                return s;
            ++s->hash_size;
            ++str;
            ++i;
        }
        s->cstr[i] = 0;
    }
    cstring tmp = s;
    sb->str = cstr_cat2(tmp->cstr, str);
    cstr_release(tmp);
    return sb->str;
}
```

`cstr_cat()` appends the string to the given `cstr_buffer`. There are two cases:

1. The result is smaller than `CSTR_STACK_SIZE`, so it can be stored directly in the buffer inside the cstring.
2. The length is greater than or equal to `CSTR_STACK_SIZE`, so it falls through to `cstr_cat2()`.

```c
static cstring cstr_cat2(const char *a, const char *b)
{
    size_t sa = strlen(a), sb = strlen(b);
    if (sa + sb < CSTR_INTERNING_SIZE) {
        char tmp[CSTR_INTERNING_SIZE];
        memcpy(tmp, a, sa);
        memcpy(tmp + sa, b, sb);
        tmp[sa + sb] = 0;
        return cstr_interning(tmp, sa + sb, hash_blob(tmp, sa + sb));
    }
    cstring p = xalloc(sizeof(struct __cstr_data) + sa + sb + 1);
    if (!p)
        return NULL;
    char *ptr = (char *) (p + 1);
    p->cstr = ptr;
    p->type = 0;
    p->ref = 1;
    memcpy(ptr, a, sa);
    memcpy(ptr + sa, b, sb);
    ptr[sa + sb] = 0;
    p->hash_size = 0;
    return p;
}
```

`cstr_cat2()` first tries to intern the string when the combined length is less than `CSTR_INTERNING_SIZE`. Otherwise it allocates space on the heap, storing the entire string right after the structure itself because the string is too long to intern, and then sets `type` and `hash_size` to 0 and `ref` to 1.

:::warning
The strange thing here is that `CSTR_INTERNING_SIZE` is 32 while `CSTR_STACK_SIZE` is 128, so any string reaching `cstr_cat2()` should already be longer than `CSTR_STACK_SIZE` and therefore never takes the interning path?
:::

Then look at `cstr_interning()`:

```c
static cstring cstr_interning(const char *cstr, size_t sz, uint32_t hash)
{
    cstring ret;
    CSTR_LOCK();
    ret = interning(&__cstr_ctx, cstr, sz, hash);
    if (!ret) {
        expand(&__cstr_ctx);
        ret = interning(&__cstr_ctx, cstr, sz, hash);
    }
    ++__cstr_ctx.total;
    CSTR_UNLOCK();
    return ret;
}
```

To prevent multiple threads from interning into `__cstr_ctx` at the same time, the operation is protected by a lock. Inside `interning()`, if the hash table has not been created yet or has run out of space, `expand(&__cstr_ctx)` enlarges it and the lookup is retried.

```c
static cstring interning(struct __cstr_interning *si,
                         const char *cstr,
                         size_t sz,
                         uint32_t hash)
{
    if (!si->hash)
        return NULL;
```

At the beginning it checks whether the hash table has been created; if not, it returns NULL and leaves it to `expand()` to initialize the table.

```c
    int index = (int) (hash & (si->size - 1));
    struct __cstr_node *n = si->hash[index];
    while (n) {
        if (n->str.hash_size == hash) {
            if (!strcmp(n->str.cstr, cstr))
                return &n->str;
        }
        n = n->next;
    }
```

This computes the index; entries that hash to the same index are chained in a linked list. For each node the `hash_size` field (which stores the hash for interned strings) is compared first, and only then are the strings themselves compared, returning the entry if found.

```c
    // 80% (4/5) threshold
    if (si->total * 5 >= si->size * 4)
        return NULL;
    if (!si->pool) {
        si->pool = xalloc(sizeof(struct __cstr_pool));
        si->index = 0;
    }
```

Since the string was not found, it must be inserted into the hash table, so the code first checks whether the table has reached 80% of its capacity; if so, it returns NULL to trigger `expand()`. If the pool has not been initialized yet, a whole block of nodes is allocated at once, which avoids frequent `malloc()` calls and improves locality.
```c
    n = &si->pool->node[si->index++];
    memcpy(n->buffer, cstr, sz);
    n->buffer[sz] = 0;
    cstring cs = &n->str;
    cs->cstr = n->buffer;
    cs->hash_size = hash;
    cs->type = CSTR_INTERNING;
    cs->ref = 0;
    n->next = si->hash[index];
    si->hash[index] = n;
    return cs;
}
```

Finally the fields of the new string are filled in and the node is inserted into the hash table. `ref` is set to 0, which means an interned string is never released until the end of the program.

Then enter the `cmp()` part:

```c
static cstring cmp(cstring t)
{
    CSTR_LITERAL(hello, "Hello string");
    CSTR_BUFFER(ret);
    cstr_cat(ret, cstr_equal(hello, t) ? "equal" : "not equal");
    return cstr_grab(CSTR_S(ret));
}
```

`CSTR_LITERAL` initializes a static string by cloning it once. If multiple threads execute `CSTR_LITERAL()` at the same time, two copies could be created, so `__sync_bool_compare_and_swap(&var, NULL, tmp)` checks whether `var` has already been set: the first thread to swap succeeds and publishes its copy, while later threads get false and free theirs.

```c
#define CSTR_LITERAL(var, cstr)                                               \
    static cstring var = NULL;                                                \
    if (!var) {                                                               \
        cstring tmp = cstr_clone("" cstr, (sizeof(cstr) / sizeof(char)) - 1); \
        if (tmp->type == 0) {                                                 \
            tmp->type = CSTR_PERMANENT;                                       \
            tmp->ref = 0;                                                     \
        }                                                                     \
        if (!__sync_bool_compare_and_swap(&var, NULL, tmp)) {                 \
            if (tmp->type == CSTR_PERMANENT)                                  \
                free(tmp);                                                    \
        }                                                                     \
    }
```

`cstr_clone()` is quite similar to `cstr_cat2()`: it checks the size before choosing between interning and allocating on the heap. The slight contradiction here is that for a long string `cstr_clone()` sets `ref` to 1, only for `CSTR_LITERAL` to set it back to 0 right after, which is somewhat redundant.
```c
cstring cstr_clone(const char *cstr, size_t sz)
{
    if (sz < CSTR_INTERNING_SIZE)
        return cstr_interning(cstr, sz, hash_blob(cstr, sz));
    cstring p = xalloc(sizeof(struct __cstr_data) + sz + 1);
    if (!p)
        return NULL;
    void *ptr = (void *) (p + 1);
    p->cstr = ptr;
    p->type = 0;
    p->ref = 1;
    memcpy(ptr, cstr, sz);
    ((char *) ptr)[sz] = 0;
    p->hash_size = 0;
    return p;
}
```

Finally, look at `cstr_equal()` and `cstr_grab()`:

```c
int cstr_equal(cstring a, cstring b)
{
    if (a == b)
        return 1;
    if ((a->type == CSTR_INTERNING) && (b->type == CSTR_INTERNING))
        return 0;
    if ((a->type == CSTR_ONSTACK) && (b->type == CSTR_ONSTACK)) {
        if (a->hash_size != b->hash_size)
            return 0;
        return memcmp(a->cstr, b->cstr, a->hash_size) == 0;
    }
    uint32_t hasha = cstr_hash(a);
    uint32_t hashb = cstr_hash(b);
    if (hasha != hashb)
        return 0;
    return !strcmp(a->cstr, b->cstr);
}
```

`cstr_equal()` first checks whether `a` and `b` point to the same object. If both are interned, two distinct pointers cannot hold the same contents (interning guarantees each string is stored once), so it returns 0 immediately. If both are on the stack, `hash_size` records the string length, so the lengths are compared first and then the contents with `memcmp()`. In every other case (long strings or mixed types) the hash values are computed and compared first, and only on a hash match are the strings actually compared with `strcmp()`.

```c
cstring cstr_grab(cstring s)
{
    if (s->type & (CSTR_PERMANENT | CSTR_INTERNING))
        return s;
    if (s->type == CSTR_ONSTACK)
        return cstr_clone(s->cstr, s->hash_size);
    if (s->ref == 0)
        s->type = CSTR_PERMANENT;
    else
        __sync_add_and_fetch(&s->ref, 1);
    return s;
}
```

`cstr_grab()` takes a reference on a cstring before returning it: permanent and interned strings are returned as-is, an on-stack string is cloned (and possibly interned) via `cstr_clone()`, and otherwise the reference count is incremented, or the string is promoted to `CSTR_PERMANENT` if `ref` is 0.