
Link List Sort - 2

contributed by < jhan1998 >

Linked-List merge sort

Principle

void list_merge_sort(queue_t *q)
{
    if (list_is_singular(&q->list))
        return;

    queue_t left;
    struct list_head sorted;
    INIT_LIST_HEAD(&left.list);
    list_cut_position(&left.list, &q->list, &get_middle(&q->list)->list);
    list_merge_sort(&left);
    list_merge_sort(q);
    list_merge(&left.list, &q->list, &sorted);
    INIT_LIST_HEAD(&q->list);
    list_splice_tail(&sorted, &q->list);
}

The abort condition can be seen at the beginning of list_merge_sort():

if (list_is_singular(&q->list))
    return;
static inline int list_is_singular(const struct list_head *head)
{
    return (!list_empty(head) && head->prev == head->next);
}

Here list_is_singular(&q->list) checks whether the list contains exactly one element (it is not empty and head->prev == head->next), which is the base case of the recursion.

Next comes the partition step, which uses list_cut_position(). Its behavior can be seen from the original code:

static inline void list_cut_position(struct list_head *head_to,
                                     struct list_head *head_from,
                                     struct list_head *node)
{
    struct list_head *head_from_first = head_from->next;

    if (list_empty(head_from))
        return;

    if (head_from == node) {
        INIT_LIST_HEAD(head_to);
        return;
    }

    head_from->next = node->next;
    head_from->next->prev = head_from;

    head_to->prev = node;
    node->next = head_to;
    head_to->next = head_from_first;
    head_to->next->prev = head_to;
}

list_cut_position() cuts the list head_from at the given node: everything from the first element up to and including node is moved onto head_to, while the remaining elements stay on head_from.
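Its behaviour can be checked with a small standalone sketch that reuses the list.h shown later in this note; the item type and print helper below are made up for illustration:

/* Minimal sketch of list_cut_position(): cut a five-element list after
 * its third node. Assumes the list.h shown later in this note.
 */
#include <stdio.h>
#include "list.h"

struct item {
    int v;
    struct list_head list;
};

static void print_list(const char *name, struct list_head *head)
{
    struct list_head *node;
    printf("%s:", name);
    list_for_each (node, head)
        printf(" %d", list_entry(node, struct item, list)->v);
    printf("\n");
}

int main(void)
{
    LIST_HEAD(from);
    LIST_HEAD(to);
    struct item items[5];

    for (int i = 0; i < 5; i++) {
        items[i].v = i;                      /* 0 1 2 3 4 */
        list_add_tail(&items[i].list, &from);
    }

    /* cut after the third element: 0 1 2 move to "to", 3 4 stay on "from" */
    list_cut_position(&to, &from, &items[2].list);
    print_list("to", &to);     /* to: 0 1 2 */
    print_list("from", &from); /* from: 3 4 */
    return 0;
}

The figures below illustrate the pointer updates step by step.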







[Figure: initial state. head_from is a circular list containing a (head_from_first), b, node, c, d; head_to is an empty list pointing to itself.]

[Figure: after head_from->next = node->next, head_from links directly to c (node->next) and d, while a, b and node are detached from it.]

[Figure: final state. head_to now contains a (head_from_first), b and node; head_from keeps c and d.]

After the split, list_merge_sort() recurses on both halves until each sublist can no longer be divided, and then the halves are combined by list_merge():

static void list_merge(struct list_head *lhs,
                       struct list_head *rhs,
                       struct list_head *head)
{
    INIT_LIST_HEAD(head);
    if (list_empty(lhs)) {
        list_splice_tail(lhs, head);
        return;
    }
    if (list_empty(rhs)) {
        list_splice_tail(rhs, head);
        return;
    }

    while (!list_empty(lhs) && !list_empty(rhs)) {
        char *lv = list_entry(lhs->next, list_ele_t, list)->value;
        char *rv = list_entry(rhs->next, list_ele_t, list)->value;
        struct list_head *tmp = strcmp(lv, rv) <= 0 ? lhs->next : rhs->next;
        list_del(tmp);
        list_add_tail(tmp, head);
    }
    list_splice_tail(list_empty(lhs) ? rhs : lhs, head);
}

list_splice_tail() appends one list to the tail of another.

The checks if (list_empty(lhs)) and if (list_empty(rhs)) in list_merge() are supposed to handle the case where one side is empty by splicing the other list onto head, so they should be changed to this:

if (list_empty(lhs)) {
    list_splice_tail(rhs, head);
    return;
}
if (list_empty(rhs)) {
    list_splice_tail(lhs, head);
    return;
}

Improvement

static void list_merge(struct list_head *lhs,
                       struct list_head *rhs,
                       struct list_head *head)
{
    INIT_LIST_HEAD(head);
    if (list_empty(lhs)) {
        list_splice_tail(rhs, head);
        return;
    }
    if (list_empty(rhs)) {
        list_splice_tail(lhs, head);
        return;
    }

    while (!list_empty(lhs) && !list_empty(rhs)) {
        char *lv = list_entry(lhs->next, list_ele_t, list)->value;
        char *rv = list_entry(rhs->next, list_ele_t, list)->value;
        struct list_head *tmp = strcmp(lv, rv) <= 0 ? lhs->next : rhs->next;
        list_del(tmp);
        list_add_tail(tmp, head);
    }
    list_splice_tail(list_empty(lhs) ? rhs : lhs, head);
}

In list_merge(), the while loop condition already checks whether either linked list is empty, so the two early-return if checks above can simply be deleted.

Based on the experience of modifying linux-list, the queue_t structure is superfluous: a plain struct list_head can serve as the head that links every node, and list_entry() can then be used to reach the value of each list_ele_t.
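For reference, the element type list_ele_t used below is assumed to be the one from the original quiz, roughly:

/* Assumed element type from the original quiz (not shown in this note):
 * a string payload plus an embedded list_head used for linking.
 */
typedef struct {
    char *value;
    struct list_head list;
} list_ele_t;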

The corresponding modification is as follows:
insert_head

bool insert_head(char *s, struct list_head *head)
{
    list_ele_t *newh = malloc(sizeof(list_ele_t));
    if (!newh)
        return false;

    char *new_value = strdup(s);
    if (!new_value) {
        free(newh);
        return false;
    }

    newh->value = new_value;
    list_add_tail(&newh->list, head);

    return true;
}

list_merge_sort

void list_merge_sort(struct list_head *head)
{
    if (list_empty(head) || list_is_singular(head))
        return;

    struct list_head left;
    struct list_head sorted;
    INIT_LIST_HEAD(&left);
    list_cut_position(&left, head, get_middle(head));
    list_merge_sort(&left);
    list_merge_sort(head);
    list_merge(&left, head, &sorted);
    INIT_LIST_HEAD(head);
    list_splice_tail(&sorted, head);
}

list_merge

static void list_merge(struct list_head *lhs,
                       struct list_head *rhs,
                       struct list_head *head)
{
    INIT_LIST_HEAD(head);
    
    while (!list_empty(lhs) && !list_empty(rhs)) {
        char *lv = list_entry(lhs->next, list_ele_t, list)->value;
        char *rv = list_entry(rhs->next, list_ele_t, list)->value;
        struct list_head *tmp = strcmp(lv, rv) <= 0 ? lhs->next : rhs->next;
        list_del(tmp);
        list_add_tail(tmp, head);
    }
    list_splice_tail(list_empty(lhs) ? rhs : lhs, head);
}

get_middle

static struct list_head *get_middle(struct list_head *list)
{
    struct list_head *fast = list->next, *slow;
    list_for_each (slow, list) {
        if (fast->next == list || fast->next->next == list)
            break;
        fast = fast->next->next;
    }
    return slow;
}

validate

static bool validate(struct list_head *head, FILE *fp)
{
    struct list_head *node;
    list_for_each (node, head) {
        fprintf(fp, "%s", list_entry(node, list_ele_t, list)->value);
        if (node->next == head)
            break;
        if (strcmp(list_entry(node, list_ele_t, list)->value,
                   list_entry(node->next, list_ele_t, list)->value) > 0)
            return false;
    }
    return true;
}

list_free

void list_free(struct list_head *head)
{
    for (struct list_head *node = (head)->next; node != (head);) {
        struct list_head *tmp = node;
        node = node->next;
        list_del(tmp);
        list_ele_t *entry = list_entry(tmp, list_ele_t, list);
        free(entry->value);
        free(entry);
    }
}

In short, struct list_head replaces queue_t for linking the nodes. Note that list_free() cannot simply use list_for_each to look up each list_entry and free it, because freeing a list_ele_t also frees the list_head embedded inside it; advancing the iterator afterwards dereferences freed memory, and valgrind then reports an invalid read of size 8.
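As a minimal sketch of the alternative, the same cleanup can be written with the list_for_each_entry_safe macro defined later in list.h, which caches the next entry before the current one is freed:

#include <stdlib.h>
#include "list.h"

/* Sketch: free every element safely; "safe" holds the next entry before
 * the current one is released, so the iteration never touches freed memory.
 */
void list_free_safe(struct list_head *head)
{
    list_ele_t *entry, *safe;
    list_for_each_entry_safe (entry, safe, head, list) {
        list_del(&entry->list);
        free(entry->value);
        free(entry);
    }
}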

After the improvement, the amount of allocated memory is reduced. The following is a comparison:

  • Before change
==29668== HEAP SUMMARY:
==29668==     in use at exit: 0 bytes in 0 blocks
==29668==   total heap usage: 187,657 allocs, 187,657 frees, 5,280,172 bytes allocated
  • After change
==29631== HEAP SUMMARY:
==29631==     in use at exit: 0 bytes in 0 blocks
==29631==   total heap usage: 187,658 allocs, 187,658 frees, 4,534,164 bytes allocated

How lib/list_sort.c in the Linux kernel is optimized

Referring to sysprog21/linux-list, [lib/list_sort.c](https://github.com/torvalds/linux/blob/master/lib/list_sort.c) is extracted into a standalone user-level application.

The program is divided into three parts: list.h, list_sort.h and list_sort.c.

First, list.h follows the linux-list implementation: it defines the main structure and the APIs that are used, such as list_add_tail() and container_of, simply by extracting the relevant parts of linux/list.h.

list.h

#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <time.h>
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#define true 1
#define false 0

/**
 * container_of() - Calculate address of object that contains address ptr
 * @ptr: pointer to member variable
 * @type: type of the structure containing ptr
 * @member: name of the member variable in struct @type
 *
 * Return: @type pointer of object containing ptr
 */
#ifndef container_of
#define container_of(ptr, type, member)                            \
    __extension__({                                                \
        const __typeof__(((type *) 0)->member) *__pmember = (ptr); \
        (type *) ((char *) __pmember - offsetof(type, member));    \
    })
#endif

/**
 * struct list_head - Head and node of a doubly-linked list
 * @prev: pointer to the previous node in the list
 * @next: pointer to the next node in the list
 */
struct list_head {
    struct list_head *prev, *next;
};

struct listitem {
    uint16_t i;
    struct list_head list;
};

/**
 * LIST_HEAD - Declare list head and initialize it
 * @head: name of the new object
 */
#define LIST_HEAD(head) struct list_head head = {&(head), &(head)}

/**
 * INIT_LIST_HEAD() - Initialize empty list head
 * @head: pointer to list head
 */
static inline void INIT_LIST_HEAD(struct list_head *head)
{
    head->next = head; head->prev = head;
}

/**
 * list_add_tail() - Add a list node to the end of the list
 * @node: pointer to the new node
 * @head: pointer to the head of the list
 */
static inline void list_add_tail(struct list_head *node, struct list_head *head)
{
    struct list_head *prev = head->prev;

    prev->next = node;
    node->next = head;
    node->prev = prev;
    head->prev = node;
}

/**
 * list_del() - Remove a list node from the list
 * @node: pointer to the node
 */
static inline void list_del(struct list_head *node)
{
    struct list_head *next = node->next, *prev = node->prev;
    next->prev = prev; prev->next = next;
}

/**
 * list_empty() - Check if list head has no nodes attached
 * @head: pointer to the head of the list
 */
static inline int list_empty(const struct list_head *head)
{
    return (head->next == head);
}

/**
 * list_is_singular() - Check if list head has exactly one node attached
 * @head: pointer to the head of the list
 */
static inline int list_is_singular(const struct list_head *head)
{
    return (!list_empty(head) && head->prev == head->next);
}

/**
 * list_splice_tail() - Add list nodes from a list to end of another list
 * @list: pointer to the head of the list with the node entries
 * @head: pointer to the head of the list
 */
static inline void list_splice_tail(struct list_head *list,
                                    struct list_head *head)
{
    struct list_head *head_last = head->prev;
    struct list_head *list_first = list->next, *list_last = list->prev;

    if (list_empty(list))
        return;

    head->prev = list_last;
    list_last->next = head;

    list_first->prev = head_last;
    head_last->next = list_first;
}

/**
 * list_cut_position() - Move beginning of a list to another list
 * @head_to: pointer to the head of the list which receives nodes
 * @head_from: pointer to the head of the list
 * @node: pointer to the node in which defines the cutting point
 */
static inline void list_cut_position(struct list_head *head_to,
                                     struct list_head *head_from,
                                     struct list_head *node)
{
    struct list_head *head_from_first = head_from->next;

    if (list_empty(head_from))
        return;

    if (head_from == node) {
        INIT_LIST_HEAD(head_to);
        return;
    }

    head_from->next = node->next;
    head_from->next->prev = head_from;

    head_to->prev = node;
    node->next = head_to;
    head_to->next = head_from_first;
    head_to->next->prev = head_to;
}

/**
 * list_entry() - Calculate address of entry that contains list node
 * @node: pointer to list node
 * @type: type of the entry containing the list node
 * @member: name of the list_head member variable in struct @type
 */
#define list_entry(node, type, member) container_of(node, type, member)

/**
 * list_first_entry() - get first entry of the list
 * @head: pointer to the head of the list
 * @type: type of the entry containing the list node
 * @member: name of the list_head member variable in struct @type
 */
#define list_first_entry(head, type, member) \
    list_entry((head)->next, type, member)

/**
 * list_for_each - iterate over list nodes
 * @node: list_head pointer used as iterator
 * @head: pointer to the head of the list
 */
#define list_for_each(node, head) \
    for (node = (head)->next; node != (head); node = node->next)

/**
 * list_for_each_entry_safe - iterate over list entries and allow deletes
 * @entry: pointer used as iterator
 * @safe: @type pointer used to store info for next entry in list
 * @head: pointer to the head of the list
 * @member: name of the list_head member variable in struct type of @entry
 *
 * The current node (iterator) is allowed to be removed from the list. Any
 * other modifications to the the list will cause undefined behavior.
 *
 * FIXME: remove dependency of __typeof__ extension
 */
#define list_for_each_entry_safe(entry, safe, head, member)                \
    for (entry = list_entry((head)->next, __typeof__(*entry), member),     \
        safe = list_entry(entry->member.next, __typeof__(*entry), member); \
         &entry->member != (head); entry = safe,                           \
        safe = list_entry(safe->member.next, __typeof__(*entry), member))

static inline uint8_t getnum(void)
{
    static uint16_t s1 = UINT16_C(2);
    static uint16_t s2 = UINT16_C(1);
    static uint16_t s3 = UINT16_C(1);

    s1 *= UINT16_C(171);
    s1 %= UINT16_C(30269);

    s2 *= UINT16_C(172);
    s2 %= UINT16_C(30307);

    s3 *= UINT16_C(170);
    s3 %= UINT16_C(30323);

    return s1 ^ s2 ^ s3;
}

static uint16_t get_unsigned16(void)
{
    uint16_t x = 0;
    size_t i;

    for (i = 0; i < sizeof(x); i++) {
        x <<= 8;
        x |= getnum();
    }

    return x;
}

static inline int cmpint(const void *p1, const void *p2)
{
    const uint16_t *i1 = (const uint16_t *) p1;
    const uint16_t *i2 = (const uint16_t *) p2;

    return *i1 - *i2;
}

static inline int cmplist(void *arrange, struct list_head *a, struct list_head *b)
{
    const uint16_t ai = container_of(a, struct listitem, list)->i;
    const uint16_t bi = container_of(b, struct listitem, list)->i;

    return ai - bi;
}

static inline void random_shuffle_array(uint16_t *operations, uint16_t len)
{
    uint16_t i;
    uint16_t j;

    for (i = 0; i < len; i++) {
        /* WARNING biased shuffling */
        j = get_unsigned16() % (i + 1);
        operations[i] = operations[j];
        operations[j] = i;
    }
}
#endif

list_sort.h declares the sorting functions that will be used in list_sort.c.

list_sort.h

#ifndef _LINUX_LIST_SORT_H
#define _LINUX_LIST_SORT_H
#include "list.h"

typedef int __attribute__((nonnull(2,3))) (*cmp_func)(void *,
		struct list_head const *, struct list_head const *);

__attribute__((nonnull(2,3,4)))
static struct list_head *merge(void *priv, cmp_func cmp,
				struct list_head *a, struct list_head *b);

__attribute__((nonnull(2,3,4,5)))
static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
			struct list_head *a, struct list_head *b);

__attribute__((nonnull(2,3)))
void list_sort(void *priv, struct list_head *head,
	       int (*cmp)(void *priv, struct list_head *a,
			  struct list_head *b));
#endif

The main porting work is in list_sort.c, which contains the implementation of list_sort() together with a correctness check. The following is the main function.

main

int main(){
    struct list_head testlist;
    struct listitem *item, *is = NULL;
    size_t i;
    FILE *fp = fopen("test.txt", "w");
    random_shuffle_array(values, (uint16_t) ARRAY_SIZE(values));

    INIT_LIST_HEAD(&testlist);

    assert(list_empty(&testlist));

    for (i = 0; i < ARRAY_SIZE(values); i++) {
        item = (struct listitem *) malloc(sizeof(*item));
        assert(item);
        item->i = values[i];
        list_add_tail(&item->list, &testlist);
    }

    assert(!list_empty(&testlist));
    list_for_each_entry_safe (item, is, &testlist, list) {
        fprintf(fp, "%d ", item->i);
    }
    fprintf(fp,"\n");

    qsort(values, ARRAY_SIZE(values), sizeof(values[0]), cmpint);
    list_sort(NULL, &testlist, cmplist);

    i = 0;
    list_for_each_entry_safe (item, is, &testlist, list) {
        fprintf(fp, "%d ", item->i);
        assert(item->i == values[i]);
        list_del(&item->list);
        free(item);
        i++;
    }
    fprintf(fp, "\n");
    fclose(fp);

    assert(i == ARRAY_SIZE(values));
    assert(list_empty(&testlist));

    return 0;
}

Analysis

Below we first look at the comment block of the list_sort() function in the Linux kernel.

This mergesort is as eager as possible while always performing at least 2:1 balanced merges. Given two pending sublists of size 2^k, they are merged to a size-2^(k+1) list as soon as we have 2^k following elements.
Thus, it will avoid cache thrashing as long as 3 * 2^k elements can fit into the cache. Not quite as good as a fully-eager bottom-up mergesort, but it does use 0.2*n fewer comparisons, so is faster in the common case that everything fits into L1.

list_sort() is mainly optimized for cache behavior. As the comment says, merges are always performed at a ratio of at least 2:1, so cache thrashing is avoided as long as 3 * 2^k elements fit in the cache.

While list_sort() walks the input, it moves the elements to be processed onto a separate list called pending. Because merging is done in this 2:1 fashion, two pending sublists of size 2^k are only merged once another 2^k elements have been seen. After the whole input has been consumed, the remaining pending sublists have sizes 2^k, 2^(k-1), ..., 2^2, 2^1, so the final pass can merge them while keeping the 2:1 ratio mentioned in the comment.

e.g.

| count | pending sublist sizes | follow |
|-------|-----------------------|--------|
| 0 | 1 | 1 |
| 1 | 1 1 | 1 |
| 2 (merge) | 2 1 | 1 |
| 3 | 2 1 1 | 1 |
| 4 (merge) | 2 2 1 | 1 |
| 5 (merge) | 4 1 1 | 1 |
| 6 (merge) | 4 2 1 | 1 |
| 7 | 4 2 1 1 | 1 |
| 8 (merge) | 4 2 2 1 | 1 |
| 9 (merge) | 4 4 1 1 | 1 |
| 10 (merge) | 4 4 2 1 | 1 |
| 11 (merge) | 8 2 1 1 | 1 |
| 12 (merge) | 8 2 2 1 | 1 |
| 13 (merge) | 8 4 1 1 | 1 |
| 14 (merge) | 8 4 2 1 | done |

With 15 elements the pending lists evolve as in the table above; at the end they are left with sizes 2^3, 2^2, 2^1, 2^0, and the final merges combine them from the smallest to the largest.

This scheme avoids badly unbalanced merges. Although it is not quite as cache-friendly as a fully-eager bottom-up mergesort, it uses about 0.2 * n fewer comparisons, so it is faster in the common case where everything fits into the L1 cache.
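The rule for when a merge is triggered can be reproduced with a small sketch: the kernel skips the merge exactly when count has all bits set (count = 2^k - 1, i.e. count + 1 is a power of two), which is what produces the (merge) marks in the table above. The helper below only illustrates that condition; it is not code taken from the kernel.

#include <stdbool.h>
#include <stdio.h>

/* A merge is skipped only when count is of the form 2^k - 1
 * (i.e. count + 1 is a power of two); otherwise the two smallest
 * equal-sized pending sublists are merged.
 */
static bool triggers_merge(unsigned count)
{
    return (count & (count + 1)) != 0;
}

int main(void)
{
    for (unsigned count = 0; count < 15; count++)
        printf("%2u%s\n", count, triggers_merge(count) ? " (merge)" : "");
    return 0;
}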

Test and performance comparison

Compared with the quick sort implemented in quiz1:

Let's compare the heap and stack memory the two allocate during execution.

heap

list_sort

==30058== HEAP SUMMARY:
==30058== in use at exit: 0 bytes in 0 blocks
==30058== total heap usage: 1,025,002 allocs, 1,025,002 frees, 26,628,648 bytes allocated

quicksort

==30107== HEAP SUMMARY:
==30107== in use at exit: 0 bytes in 0 blocks
==30107== total heap usage: 1,025,002 allocs, 1,025,002 frees, 26,628,648 bytes allocated

The heap usage of the two is identical here: heap space is mainly allocated by malloc, so with the same structure and the same number of list nodes the heap footprint barely differs.

So here we focus on the performance of the stack during execution.

stack

Here we use valgrind --tool=massif --stacks=yes:

list_sort

jhan1998@jhan1998-MacBookPro:~/linux2021/quiz2$ ms_print massif.out.30375
--------------------------------------------------------------------------------
Command:            ./list_sort
Massif arguments:   --stacks=yes
ms_print arguments: massif.out.30375
--------------------------------------------------------------------------------


    KB
43.77^#                                                            :          
     |#: ::: :   :     :  :  ::::  : :: : :  :    :: :: ::::::: :: :: :: :: ::
     |#: ::: : ::::::  :: :: :: ::::::: :::: :: : @: :::@:::::@ :: :@ :: :@ ::
     |#: ::: : ::::::  :: :: :: :: :::: ::::::: : @: :::@:::::@ :: :@ :: :@ ::
     |#: ::: : ::::::  :: :: :: :: :::: ::::::: : @: :::@:::::@ :: :@ :: :@ ::
     |#: ::: : ::::::  :: :: :: :: :::: ::::::: : @: :::@:::::@ :: :@ :: :@ ::
     |#: ::: : ::::::: :: :: :: :: :::: ::::::: : @: :::@:::::@ :: :@ :: :@ ::
     |#: ::: : ::::::: :: :: :: :: :::: ::::::: : @: :::@:::::@ :: :@ :: :@ ::
     |#: ::: : ::::::: :: :: :: :: :::: ::::::: : @: :::@:::::@ :: :@ :: :@ ::
     |#::::: : ::::::: :: :: :: :: :::: ::::::::: @: :::@:::::@ :: :@ :: :@ ::
     |#::::: : ::::::: :: :: :: :: :::: ::::::::: @: :::@:::::@ :: :@ :: :@ ::
     |#::::: : ::::::: :: :: :: :: :::: ::::::::: @: :::@:::::@ :: :@ :: :@ ::
     |#::::: : ::::::: :: :: :: :: :::: ::::::::: @: :::@:::::@ :: :@ :: :@ ::
     |#::::: : ::::::: :: :: :: :: :::: ::::::::: @: :::@:::::@ :: :@ :: :@ ::
     |#::::: : ::::::: :: :: :: :: :::: ::::::::: @: :::@:::::@ :: :@ :: :@ ::
     |#::::: : ::::::: :: :: :: :: :::::::::::::: @: :::@:::::@ :: :@ :: :@ ::
     |#::::: : ::::::: :: :: :: :: :::::::::::::: @: :::@:::::@::: :@ :: :@ ::
     |#::::: :@:::::::::: :: :: :: :::::::::::::: @: :::@:::::@::: :@ :: :@ ::
     |#::::: :@:::::::::: :: :: :: :::::::::::::: @: :::@:::::@::: :@ :: :@ ::
     |#::::: :@:::::::::: :: :: :: :::::::::::::: @: :::@:::::@::: :@ :: :@ ::
   0 +----------------------------------------------------------------------->Gi
     0                                                                   1.274

Number of snapshots: 97
 Detailed snapshots: [1 (peak), 10, 20, 23, 51, 61, 71, 81, 91]

--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
  0              0                0                0             0            0
  1        414,231           44,824           26,624        16,392        1,808
59.40% (26,624B) (heap allocation functions) malloc/new/new[], --alloc-fns, etc.
->54.83% (24,576B) 0x108F0A: main (in /home/jhan1998/linux2021/quiz2/list_sort)
| 
->04.57% (2,048B) 0x4E776DE: qsort_r (msort.c:222)
  ->04.57% (2,048B) 0x108FC0: main (in /home/jhan1998/linux2021/quiz2/list_sort)
    
--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
  2     21,329,084           44,048           26,624        16,392        1,032
  3     45,318,463           25,520           15,096        10,064          360
  4     60,925,639           43,784           26,624        16,392          768
  5     82,870,639           44,808           26,624        16,392        1,792
  6    100,113,101            2,512            1,248           832          432
  7    111,869,386           43,664           26,624        16,392          648
  8    126,238,190              440                0             0          440
  9    149,750,638           44,432           26,624        16,392        1,416
 10    170,128,108            8,568            4,920         3,280          368
--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
 92  1,321,466,004              408                0             0          408
 93  1,332,959,706           44,304           26,624        16,392        1,288
 94  1,344,453,392           41,536           24,576        16,384          576
 95  1,355,947,105           44,560           26,624        16,392        1,544
 96  1,367,440,823           41,528           24,576        16,384          568

quicksort

jhan1998@jhan1998-MacBookPro:~/linux2021/quiz2$ ms_print massif.out.30463
--------------------------------------------------------------------------------
Command:            ./qsort
Massif arguments:   --stacks=yes
ms_print arguments: massif.out.30463
--------------------------------------------------------------------------------


    KB
44.30^         ##                                                             
     |    :    #         ::       :    ::   :: :::::@:   @:::  ::  ::        :
     |@@:::  ::# ::: ::::::  :: @@:::::: ::::  : :: @::::@: ::@: : : :::: @ ::
     |@ : :  ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: : : :: : @ ::
     |@ : :  ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: : : :: ::@ ::
     |@ : :  ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: : : :: ::@ ::
     |@ : :  ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: : : :: ::@ ::
     |@ : :  ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: : : :: ::@ ::
     |@ : :  ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: : : :: ::@ ::
     |@ : :  ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :::::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :: ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :: ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :: ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :: ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :: ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :: ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :: ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :: ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
     |@ : :: ::# ::  : ::::  :: @ :: ::: : ::  : :: @::::@: ::@: ::: :: ::@ ::
   0 +----------------------------------------------------------------------->Gi
     0                                                                   2.076

Number of snapshots: 56
 Detailed snapshots: [1, 7 (peak), 10, 16, 20, 29, 33, 38, 42, 51]

--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
  0              0                0                0             0            0
  1     27,812,220           42,256           24,576        16,384        1,296
58.16% (24,576B) (heap allocation functions) malloc/new/new[], --alloc-fns, etc.
->58.16% (24,576B) 0x1092F5: main (in /home/jhan1998/linux2021/quiz2/qsort)
| 
->00.00% (0B) in 1+ places, all below ms_print's threshold (01.00%)

--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
  2     73,150,427           41,936           24,576        16,384          976
  3    138,499,140           44,064           24,576        16,384        3,104
  4    179,888,135           23,672           13,944         9,296          432
  5    216,949,699           42,080           24,576        16,384        1,120
  6    248,452,079           41,936           24,576        16,384          976
  7    300,748,565           45,360           24,576        16,384        4,400

--------------------------------------------------------------------------------
  n        time(i)         total(B)   useful-heap(B) extra-heap(B)    stacks(B)
--------------------------------------------------------------------------------
 52  2,146,167,569              440                0             0          440
 53  2,173,963,866           41,616           24,576        16,384          656
 54  2,201,760,176           43,264           24,576        16,384        2,304
 55  2,229,556,490           43,776           24,576        16,384        2,816

On average the stack usage of quicksort is noticeably higher, usually above 1,000 bytes. list_sort occasionally spikes above 1,000 bytes as well, but its average stays around 400-500 bytes.


Bit field operation

Principles

uint16_t func(uint16_t N) {
    /* change all right side bits to 1 */
    N |= N >> 1;
    N |= N >> 2;
    N |= N >> 4;
    N |= N >> 8;

    return (N + 1) >> 1;
}

The key to this question is understanding the meaning of return (N + 1) >> 1: after the OR steps, N has all bits below its most significant bit set, i.e. it has the form 2^n - 1, so N + 1 is a single power of two, like this:

#N
1111
#N+1
10000
#(N + 1) >> 1
1000

To see why the OR steps set all the lower bits to 1, observe the following worst case for a 16-bit unsigned integer:

#N 1000000000000000
#N |= N >> 1
1100000000000000
#N |= N >> 2
1111000000000000
#N |= N >> 4
1111111100000000
#N |= N >> 8
1111111111111111

Shifting by 2^n bits at a time, starting from n = 0 (that is, by 1, 2, 4, 8 bits), is enough to smear the most significant set bit over every lower position, so all of those bits become 1.
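Putting it together, func() rounds a non-zero value down to the nearest power of two. A quick, self-contained illustrative check:

#include <assert.h>
#include <stdint.h>

uint16_t func(uint16_t N)
{
    /* smear the most significant set bit into all lower bits */
    N |= N >> 1;
    N |= N >> 2;
    N |= N >> 4;
    N |= N >> 8;

    return (N + 1) >> 1;
}

int main(void)
{
    assert(func(5) == 4);       /* 0b0101 -> 0b0100 */
    assert(func(16) == 16);     /* already a power of two */
    assert(func(1000) == 512);
    return 0;
}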

is_power_of_2

In linux/include/linux/log2.h, is_power_of_2() is defined like this:

/*
 * Determine whether some value is a power of two, where zero is
 * *not* considered a power of two.
 */
static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
    return (n != 0 && ((n & (n - 1)) == 0));
}

Besides checking that n is non-zero, it also checks whether n & (n - 1) is 0, for example:

16 = 10000 (binary)
16 - 1 = 01111 (binary)

16 & (16 - 1) = 00000 (binary)

Since a power of two has exactly one bit set, n & (n - 1) is 0 exactly in that case, so the expression tells whether n is a power of 2.
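A few illustrative checks (the __attribute__((const)) qualifier is dropped for brevity):

#include <assert.h>
#include <stdbool.h>

static inline bool is_power_of_2(unsigned long n)
{
    return (n != 0 && ((n & (n - 1)) == 0));
}

int main(void)
{
    assert(is_power_of_2(1));    /* 2^0 */
    assert(is_power_of_2(64));
    assert(!is_power_of_2(0));   /* zero is not a power of two */
    assert(!is_power_of_2(12));  /* 0b1100 has two bits set */
    return 0;
}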

round up/down power of 2

__roundup_pow_of_two

/*
 * round up to nearest power of two
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
	return 1UL << fls_long(n - 1);
}

__rounddown_pow_of_two

/*
 * round down to nearest power of two
 */
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
{
	return 1UL << (fls_long(n) - 1);
}

fls_long

static inline unsigned fls_long(unsigned long l)
{
    if (sizeof(l) == 4)
        return fls(l);
    return fls64(l);
}

fls() stands for find last set: it returns the 1-based position of the most significant set bit, counting from the LSB.
e.g.
fls(0x80000000) = 32, fls(0x00000001) = 1

Therefore __roundup_pow_of_two uses 1UL << fls_long(n - 1) to place a single 1 one position above the most significant set bit of n - 1, while __rounddown_pow_of_two uses 1UL << (fls_long(n) - 1) to keep only the most significant set bit of n itself. If n is already a power of 2, both return the same value.
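A user-space sketch of the same idea, using GCC's __builtin_clzl as a stand-in for the kernel's fls helpers (assumption: a GCC/Clang toolchain; the kernel's actual implementations differ):

#include <assert.h>

/* 1-based index of the most significant set bit, 0 for input 0
 * (mirrors the semantics of the kernel's fls_long).
 */
static unsigned fls_long_sketch(unsigned long l)
{
    return l ? (unsigned) (8 * sizeof(l)) - (unsigned) __builtin_clzl(l) : 0;
}

static unsigned long roundup_pow_of_two_sketch(unsigned long n)
{
    return 1UL << fls_long_sketch(n - 1);
}

static unsigned long rounddown_pow_of_two_sketch(unsigned long n)
{
    return 1UL << (fls_long_sketch(n) - 1);
}

int main(void)
{
    assert(roundup_pow_of_two_sketch(9) == 16);
    assert(rounddown_pow_of_two_sketch(9) == 8);
    assert(roundup_pow_of_two_sketch(8) == 8);   /* already a power of two */
    assert(rounddown_pow_of_two_sketch(8) == 8);
    return 0;
}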


bitcpy() implementation

Principle

Here I am going to analyze the code of bitcpy() step by step

    size_t read_lhs = _read & 7;
    size_t read_rhs = 8 - read_lhs;
    const uint8_t *source = (const uint8_t *) _src + (_read / 8);
    size_t write_lhs = _write & 7;
    size_t write_rhs = 8 - write_lhs;
    uint8_t *dest = (uint8_t *) _dest + (_write / 8);

This prologue prepares for the reads and writes that follow: it splits the bit offsets _read and _write into a byte offset (the division by 8) and a bit offset within that byte (the & 7), which is what the alignment handling below is based on.

uint8_t data = *source++;
size_t bitsize = (count > 8) ? 8 : count;
if (read_lhs > 0) {
    data <<= read_lhs;
    if (bitsize > read_rhs)
        data |= (*source >> read_rhs);
}

This part reads from the source. From size_t bitsize = (count > 8) ? 8 : count; we can tell that at most one byte of data is processed per iteration. The check read_lhs > 0 tests whether the read position is byte-aligned; if it is not, the byte already loaded into data is shifted left so that it starts at the bit indicated by _read, and the following source byte is shifted right by read_rhs bits and ORed in, so that 8 consecutive bits are obtained.

The next step determines how many bits are actually needed and masks off the rest:

if (bitsize < 8)
    data &= read_mask[bitsize];

write part:

uint8_t original = *dest;
uint8_t mask = read_mask[write_lhs];
if (bitsize > write_rhs) {
            /* Cross multiple bytes */
    *dest++ = (original & mask) | (data >> write_lhs);
    original = *dest & write_mask[bitsize - write_rhs];
    *dest = original | (data << write_rhs);
} else {
            // Since write_lhs + bitsize is never >= 8, no out-of-bound access.
    mask |= write_mask[write_lhs + bitsize];
    *dest++ = (original & mask) | (data >> write_lhs);
}

First the mask is set up to describe which destination bits must be preserved. If bitsize > write_rhs, the data has to be written in two stages: original & mask keeps the bits of the first destination byte that must survive, and data >> write_lhs fills its remaining write_rhs bits; then original = *dest & write_mask[bitsize - write_rhs]; clears the bits of the second byte that are about to be overwritten, and data << write_rhs stores the leftover bits there.

In the else branch everything fits inside one destination byte, so mask |= write_mask[write_lhs + bitsize]; extends the mask to also preserve the bits after the written region; the final statement then works just like the first stage above, shifting data right by write_lhs and storing it.
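To make the two-stage write concrete, here is a small standalone sketch that writes 8 bits at a byte-unaligned destination position using only shifts and masks; it mirrors the idea of the code above but does not reuse the quiz's read_mask/write_mask tables, and the numbers are just an illustration:

#include <assert.h>
#include <stdint.h>

/* Write the 8 bits in "data" into dest[0..1], starting at bit offset
 * write_lhs (0..7) counted from the MSB of dest[0]. Bits outside the
 * written range are preserved, as in the bitcpy() write stage.
 */
static void write8_unaligned(uint8_t *dest, uint8_t data, unsigned write_lhs)
{
    unsigned write_rhs = 8 - write_lhs;

    /* first byte: keep the leading write_lhs bits, fill the rest */
    uint8_t keep_hi = (uint8_t) (0xFF << write_rhs);
    dest[0] = (dest[0] & keep_hi) | (data >> write_lhs);

    if (write_lhs) {
        /* second byte: keep the trailing bits, store the leftover bits */
        uint8_t keep_lo = (uint8_t) (0xFF >> write_lhs);
        dest[1] = (dest[1] & keep_lo) | (uint8_t) (data << write_rhs);
    }
}

int main(void)
{
    uint8_t buf[2] = {0xF0, 0x0F};   /* 11110000 00001111 */
    write8_unaligned(buf, 0xAB, 3);  /* data = 10101011, offset 3 */
    /* result: 111 10101 | 011 01111 */
    assert(buf[0] == 0xF5);
    assert(buf[1] == 0x6F);
    return 0;
}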


Cstring operation

Principle

From the test function:

static void test_cstr()
{
    CSTR_BUFFER(a);
    cstr_cat(a, "Hello ");
    cstr_cat(a, "string");
    cstring b = cmp(CSTR_S(a));
    printf("%s\n", b->cstr);
    CSTR_CLOSE(a);
    cstr_release(b);
}

CSTR_BUFFER() first creates a char array of CSTR_STACK_SIZE bytes to hold the string, then initializes a __cstr_data object whose cstr field points to that array, and finally declares a cstr_buffer whose str member points to that __cstr_data. Because cstring is struct __cstr_data * while the object above is declared as a plain struct __cstr_data, the macro assigns var->str = &var##_cstr_data;.

#define CSTR_BUFFER(var) \
    char var##_cstring[CSTR_STACK_SIZE] = {0}; \
    struct __cstr_data var##_cstr_data = {var##_cstring, 0, CSTR_ONSTACK, 0}; \
    cstr_buffer var; \
    var->str = &var##_cstr_data;

Next into cstr_cat():

cstring cstr_cat(cstr_buffer sb, const char *str)
{
    cstring s = sb->str;
    if (s->type == CSTR_ONSTACK) {
        int i = s->hash_size;
        while (i < CSTR_STACK_SIZE - 1) {
            s->cstr[i] = *str;
            if (*str == 0)
                return s;
            ++s->hash_size;
            ++str;
            ++i;
        }
        s->cstr[i] = 0;
    }
    cstring tmp = s;
    sb->str = cstr_cat2(tmp->cstr, str);
    cstr_release(tmp);
    return sb->str;
}

cstr_cat() appends the string to the given cstr_buffer, and there are two situations:

  1. The resulting length is smaller than CSTR_STACK_SIZE, so the characters are simply copied into the on-stack buffer inside the cstring.
  2. The resulting length is greater than or equal to CSTR_STACK_SIZE, so the work is handed over to cstr_cat2():
static cstring cstr_cat2(const char *a, const char *b)
{
    size_t sa = strlen(a), sb = strlen(b);
    if (sa + sb < CSTR_INTERNING_SIZE) {
        char tmp[CSTR_INTERNING_SIZE];
        memcpy(tmp, a, sa);
        memcpy(tmp + sa, b, sb);
        tmp[sa + sb] = 0;
        return cstr_interning(tmp, sa + sb, hash_blob(tmp, sa + sb));
    }
    cstring p = xalloc(sizeof(struct __cstr_data) + sa + sb + 1);
    if (!p)
        return NULL;

    char *ptr = (char *) (p + 1);
    p->cstr = ptr;
    p->type = 0;
    p->ref = 1;
    memcpy(ptr, a, sa);
    memcpy(ptr + sa, b, sb);
    ptr[sa + sb] = 0;
    p->hash_size = 0;
    return p;
}

cstr_cat2() first tries to intern the string when the combined length is less than CSTR_INTERNING_SIZE. Otherwise it allocates space on the heap: the string is stored right after the __cstr_data structure itself, type and hash_size are set to 0, and ref is set to 1.

The strange thing here is that CSTR_INTERNING_SIZE is 32 while CSTR_STACK_SIZE is 128: cstr_cat2() is only reached once the on-stack buffer is already full, so the combined string is always longer than CSTR_INTERNING_SIZE and the interning branch can never be taken?

Then, look at cstr_interning() again:

static cstring cstr_interning(const char *cstr, size_t sz, uint32_t hash)
{
    cstring ret;
    CSTR_LOCK();
    ret = interning(&__cstr_ctx, cstr, sz, hash);
    if (!ret) {
        expand(&__cstr_ctx);
        ret = interning(&__cstr_ctx, cstr, sz, hash);
    }
    ++__cstr_ctx.total;
    CSTR_UNLOCK();
    return ret;
}

To prevent multiple threads from interning into __cstr_ctx at the same time, the operation is protected by a lock. If interning() fails because the hash table has not been created yet or is too full, expand(&__cstr_ctx) grows it and interning() is retried.

static cstring interning(struct __cstr_interning *si,
                          const char *cstr,
                          size_t sz,
                          uint32_t hash)
{
     if (!si->hash)
         return NULL;

At the beginning it checks whether the hash table has been created; if not, it returns NULL so that expand() can initialize the table first.

    int index = (int) (hash & (si->size - 1));
    struct __cstr_node *n = si->hash[index];
    while (n) {
        if (n->str.hash_size == hash) {
            if (!strcmp(n->str.cstr, cstr))
                return &n->str;
        }
        n = n->next;
    }

The bucket index is computed from the hash; entries that share an index are chained in a linked list. Each node's hash_size is compared first, and only on a hash match is the string itself compared; if an equal string is found, it is returned.

    // 80% (4/5) threshold
    if (si->total * 5 >= si->size * 4)
        return NULL;
    if (!si->pool) {
        si->pool = xalloc(sizeof(struct __cstr_pool));
        si->index = 0;
    }

Since the string was not found, it has to be inserted into the hash table. First the code checks whether the table has reached 80% of its capacity; if so it returns NULL so that expand() is triggered. If the node pool has not been initialized yet, a whole pool of nodes is allocated at once, which avoids frequent malloc calls and gives better locality.

    n = &si->pool->node[si->index++];
    memcpy(n->buffer, cstr, sz);
    n->buffer[sz] = 0;

    cstring cs = &n->str;
    cs->cstr = n->buffer;
    cs->hash_size = hash;
    cs->type = CSTR_INTERNING;
    cs->ref = 0;

    n->next = si->hash[index];
    si->hash[index] = n;

    return cs;
}

Finally the fields of the new cstring are filled in and the node is inserted at the head of its hash bucket. ref is set to 0, which means an interned string is never reference-counted and is not released until the program ends.
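One detail of the lookup above worth noting: the bucket index hash & (si->size - 1) is only equivalent to hash % si->size when the table size is a power of two, which ties back to the power-of-two helpers discussed earlier. A quick illustrative check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t hash = 0xDEADBEEF;

    /* power-of-two table size: mask and modulo agree */
    assert((hash & (1024u - 1)) == (hash % 1024u));

    /* non-power-of-two size: the mask trick no longer works */
    assert((hash & (1000u - 1)) != (hash % 1000u));
    return 0;
}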

Then enter the cmp() part:

static cstring cmp(cstring t)
{
     CSTR_LITERAL(hello, "Hello string");
     CSTR_BUFFER(ret);
     cstr_cat(ret, cstr_equal(hello, t) ? "equal" : "not equal");
     return cstr_grab(CSTR_S(ret));
}

CSTR_LITERAL initializes a static cstring from a string literal via cstr_clone(). If several threads execute CSTR_LITERAL() at the same time, each could allocate its own copy, so __sync_bool_compare_and_swap(&var, NULL, tmp) is used: it updates var to tmp and returns true only if var is still NULL, so every later thread gets false and frees its redundant copy.

#define CSTR_LITERAL(var, cstr)                                               \
    static cstring var = NULL;                                                \
    if (!var) {                                                               \
        cstring tmp = cstr_clone("" cstr, (sizeof(cstr) / sizeof(char)) - 1); \
        if (tmp->type == 0) {                                                 \
            tmp->type = CSTR_PERMANENT;                                       \
            tmp->ref = 0;                                                     \
        }                                                                     \
        if (!__sync_bool_compare_and_swap(&var, NULL, tmp)) {                 \
            if (tmp->type == CSTR_PERMANENT)                                  \
                free(tmp);                                                    \
        }                                                                     \
    }
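The same initialize-once pattern in isolation, as a minimal sketch (the payload type and make_payload() are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

typedef struct {
    int value;
} payload_t;

/* Hypothetical expensive initializer. */
static payload_t *make_payload(void)
{
    payload_t *p = malloc(sizeof(*p));
    p->value = 42;
    return p;
}

static payload_t *get_singleton(void)
{
    static payload_t *instance = NULL;

    if (!instance) {
        payload_t *tmp = make_payload();
        /* Only the first thread to swap NULL -> tmp wins; the others
         * free their redundant copy, exactly like CSTR_LITERAL does.
         */
        if (!__sync_bool_compare_and_swap(&instance, NULL, tmp))
            free(tmp);
    }
    return instance;
}

int main(void)
{
    printf("%d\n", get_singleton()->value);
    return 0;
}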

cstr_clone() is quite similar to cstr_cat2(): it first checks the size and then decides whether to intern the string or allocate it on the heap. The slight contradiction is that for a long string ref is set to 1 here, yet CSTR_LITERAL immediately resets it to 0 after the clone, which is rather redundant.

cstring cstr_clone(const char *cstr, size_t sz)
{
    if (sz < CSTR_INTERNING_SIZE)
        return cstr_interning(cstr, sz, hash_blob(cstr, sz));
    cstring p = xalloc(sizeof(struct __cstr_data) + sz + 1);
    if (!p)
        return NULL;
    void *ptr = (void *) (p + 1);
    p->cstr = ptr;
    p->type = 0;
    p->ref = 1;
    memcpy(ptr, cstr, sz);
    ((char *) ptr)[sz] = 0;
    p->hash_size = 0;
    return p;
}

Finally, let's look at cstr_equal() and cstr_grab():

int cstr_equal(cstring a, cstring b)
{
    if (a == b)
        return 1;
    if ((a->type == CSTR_INTERNING) && (b->type == CSTR_INTERNING))
        return 0;
    if ((a->type == CSTR_ONSTACK) && (b->type == CSTR_ONSTACK)) {
        if (a->hash_size != b->hash_size)
            return 0;
        return memcmp(a->cstr, b->cstr, a->hash_size) == 0;
    }
    uint32_t hasha = cstr_hash(a);
    uint32_t hashb = cstr_hash(b);
    if (hasha != hashb)
        return 0;
    return !strcmp(a->cstr, b->cstr);
}

cstr_equal() first checks whether a and b point to the same object, which covers interned strings: if both types are CSTR_INTERNING but the pointers differ, they cannot be equal. If both strings live on the stack, their recorded lengths (hash_size) are compared first and then the contents with memcmp(). In every other case (long strings or different types) the hash values are computed and compared first, and only on a hash match are the strings compared with strcmp().

cstring cstr_grab(cstring s)
{
    if (s->type & (CSTR_PERMANENT | CSTR_INTERNING))
        return s;
    if (s->type == CSTR_ONSTACK)
        return cstr_clone(s->cstr, s->hash_size);
    if (s->ref == 0)
        s->type = CSTR_PERMANENT;
    else
        __sync_add_and_fetch(&s->ref, 1);
    return s;
}

cstr_grab() takes a reference to a cstring before handing it out. Permanent and interned strings are returned as-is; a string that still lives on the stack is cloned (and possibly interned) first; otherwise, a heap string with ref == 0 is promoted to CSTR_PERMANENT, while one with a non-zero ref has its reference count atomically incremented.