2021q1 Homework2 (quiz2)

contributed by < ccs100203 >

tags: `linux2021`

第 2 週測驗題

測驗 1

以下是針對 doubly-linked list 的 merge sort 實作:

#include <string.h>

typedef struct __element {
    char *value;
    struct __element *next;
    struct list_head list;
} list_ele_t;

typedef struct {
    list_ele_t *head; /* Linked list of elements */
    list_ele_t *tail;
    size_t size;
    struct list_head list;
} queue_t;

static list_ele_t *get_middle(struct list_head *list)
{
    struct list_head *fast = list->next, *slow;
    list_for_each (slow, list) {
        if (fast->next == list || fast->next->next == list)
            break;
        fast = fast->next->next;
    }
    return list_entry(slow, list_ele_t, list);
}

static void list_merge(struct list_head *lhs,
                       struct list_head *rhs,
                       struct list_head *head)
{
    INIT_LIST_HEAD(head);
    if (list_empty(lhs)) {
        list_splice_tail(lhs, head);
        return;
    }
    if (list_empty(rhs)) {
        list_splice_tail(rhs, head);
        return;
    }

    while (!list_empty(lhs) && !list_empty(rhs)) {
        char *lv = list_entry(lhs->next, list_ele_t, list)->value;
        char *rv = list_entry(rhs->next, list_ele_t, list)->value;
        struct list_head *tmp = strcmp(lv, rv) <= 0 ? lhs->next : rhs->next;
        list_del(tmp);
        list_add_tail(tmp, head);
    }
    list_splice_tail(list_empty(lhs) ? rhs : lhs, head);
}

void list_merge_sort(queue_t *q)
{
    if (list_is_singular(&q->list))
        return;

    queue_t left;
    struct list_head sorted;
    INIT_LIST_HEAD(&left.list);
    list_cut_position(&left.list, &q->list, &get_middle(&q->list)->list);
    list_merge_sort(&left);
    list_merge_sort(q);
    list_merge(&left.list, &q->list, &sorted);
    INIT_LIST_HEAD(&q->list);
    list_splice_tail(&sorted, &q->list);
}

解釋程式原理

參考 Merge sort

程式會藉由 get_middle 取得該 list 的中間點，再透過 list_cut_position 做切割，然後重複的遞迴呼叫。當所有節點都分開後會經由 list_merge 將其排序以及結合。

`get_middle`

此函式的目的是找出 list 的中間點

/**
 * list_for_each - iterate over list nodes
 * @node: list_head pointer used as iterator
 * @head: pointer to the head of the list
 */
#define list_for_each(node, head) \
    for (node = (head)->next; node != (head); node = node->next)

static list_ele_t *get_middle(struct list_head *list)
{
    struct list_head *fast = list->next, *slow;
    list_for_each (slow, list) {
        if (fast->next == list || fast->next->next == list)
            break;
        fast = fast->next->next;
    }
    return list_entry(slow, list_ele_t, list);
}

下面的圖表只繪製出 next 指標

首先，會分別用 fast、slow 指標指向 list 的前兩個 node

再來進入了 list_for_each 所定義的迴圈內，一開始的賦值階段會先將 slow 指向下一個節點

在 if 的判斷通過之前，迴圈會讓 slow 不斷指向下一個節點，而 fast 則會指向下下個節點

此時，fast->next->next == list 的判斷式會成立
可以看出 slow 會位於 list 中間的節點，所以藉著 slow 就可以將 A、B、C 與 D、E、F、G 分開。

`list_cut_position`

此函式用來將一條 list 從特定的點切分成兩條 list。
他會將 head_from 之後一直到 node 之間的節點接到 head_to。
舉例來說:

                node
                  |
                  V
HEAD->N1->N2->N3->N4->N5->N6->N7->N8
LEFT

最終情況會變成下列兩條 list

HEAD->N5->N6->N7->N8
LEFT->N1->N2->N3->N4

下面是程式碼以及更詳盡的圖解




























/**
 * list_cut_position() - Move beginning of a list to another list
 * @head_to: pointer to the head of the list which receives nodes
 * @head_from: pointer to the head of the list
 * @node: pointer to the node in which defines the cutting point
 */
static inline void list_cut_position(struct list_head *head_to,
                                     struct list_head *head_from,
                                     struct list_head *node)
{
    struct list_head *head_from_first = head_from->next;

    if (list_empty(head_from))
        return;

    if (head_from == node) {
        INIT_LIST_HEAD(head_to);
        return;
    }

    head_from->next = node->next;
    head_from->next->prev = head_from;

    head_to->prev = node;
    node->next = head_to;
    head_to->next = head_from_first;
    head_to->next->prev = head_to;
}

給定剛進入函式時的情況

執行完 Line 21, 22


head_from->next = node->next;
head_from->next->prev = head_from;

執行完 Line 24, 25


head_to->prev = node;
node->next = head_to;

執行完 Line 26, 27


head_to->next = head_from_first;
head_to->next->prev = head_to;

測驗 2

考慮函式 func 接受一個 16 位元無號整數

N

，並回傳小於或等於

N

的 power-of-2 (漢字可寫作為 2 的冪)









uint16_t func(uint16_t N) {
    /* change all right side bits to 1 */
    N |= N >> 1;
    N |= N >> 2;
    N |= N >> 4;
    N |= N >> 8;

    return (N + 1) >> 1;
}

先考慮一個數字如何轉換為 power of 2，以一個隨意的 16 位元無號整數來說，只要保留最高位的 1 並將其他都轉換為 0 即可。
所以先將比最高位還低的位元全部轉換為 1，這樣就可以產生出
$00. . .0011 . . .111$ 的數字，此時只需要將這個數字 + 1，再將他往右位移，就可以得到 power of 2。
以 1025 為例，可以看到透過前面的幾行會將較低位元的 bit 全數轉換為 1，再透過第八行將他轉換為 power of 2。

| after
execution | 15
MSB | 14~11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0
LSB |
| - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| Line 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| Line 3 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| Line 4 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| Line 5 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 1 |
| Line 6 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| Line 8 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

查閱 Linux 核心原始程式碼 power of 2

根據 log2.h

is_power_of_2

/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Determine whether some value is a power of two, where zero is
 * *not* considered a power of two.
 * Return: true if @n is a power of 2, otherwise false.
 */
static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
	return (n != 0 && ((n & (n - 1)) == 0));
}

已知如果一個數字只有任一且唯一一個 bit 為 1，其餘為 0，那麼他就屬於 power of 2。
(n & (n - 1) 的操作，其實就是將最低位的 bit 設為 0。
假設 n 為
$11000$ ， n - 1 就是
$10111$ ，此時做 and 運算就可以將最底位的 bit 設為 0。
也藉由這個特性，跟上述已知道的條件，所以在做完 (n & (n - 1) 後，若數字變為 0，則代表其為 power of 2。

`__roundup_pow_of_two` 和 `__rounddown_pow_of_two` 解析

__roundup_pow_of_two 是為了將數字無條件進位到比較大的 power of 2，反之則為 __rounddown_pow_of_two。

以
$24$ 舉例:
- __roundup_pow_of_two(24) 應得到 32
- __rounddown_pow_of_two(24) 應得到 16

觀察以下程式碼，fls_long 的作用是 find leading set，也就是找出 bit 中最高位的 1。

已知 fls_long(24) 會得到 5
- __roundup_pow_of_two
  只要再藉由 1U << 5 就能得到 32，那為什麼需要 n-1 呢?
  假設今天不是 24，而是 32 這種屬於 power of 2 的數字時，程式是不需要做進位的，但是直接做 fls_long(32) 會得到 6，代表程式在不需要進位時還是會進位。所以需要藉由 n-1 去避免掉程式多做了進位的情況。
- __rounddown_pow_of_two
  由於是要無條件捨去，所以可以很直觀的將 fls_long(n) 的結果 - 1，也就是只保留了原先數字中最高位為 1 的 bit。

/**
 * __roundup_pow_of_two() - round up to nearest power of two
 * @n: value to round up
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
	return 1UL << fls_long(n - 1);
}

/**
 * __rounddown_pow_of_two() - round down to nearest power of two
 * @n: value to round down
 */
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
{
	return 1UL << (fls_long(n) - 1);
}

如何 find leading set

由於上述的程式用到了 find leading set 的功能，所以來解釋他的做法。首先可以看到 fls_long 只是藉由不同的 type size 去呼叫不同的 fls。

fls_long

static inline unsigned fls_long(unsigned long l)
{
	if (sizeof(l) == 4)
		return fls(l);
	return fls64(l);
}

這邊只擷取了某一部分的情況，可以看到 find leading set 是藉由 number of bits 減 count leading zero 做出來的，而 __kernel_ctlz 又會藉 GCC Built in function 裡的 __builtin_clzl 去實作。
透過這些函式的串聯，就可以知道 fls_long 的功用與作法。

fls && fls64

static inline int fls64(unsigned long word)
{
	return 64 - __kernel_ctlz(word);
}

static inline int fls(unsigned int x)
{
	return fls64(x);
}

`__kernel_ctlz(x)`

#  define __kernel_ctlz(x)	__builtin_clzl(x)

參考 fls_long
參考 fls
參考 __kernel_ctlz
參考 Other Built-in Functions Provided by GCC

`attribute((const))` 用處

參考 Declaring Attributes of Functions
Basically this is just slightly more strict class than the pure attribute below, since function is not allowed to read global memory.
Note that a function that has pointer arguments and examines the data pointed to must not be declared const. Likewise, a function that calls a non-const function usually must not be const. It does not make sense for a const function to return void.

規定 function 不能夠含有指標相關的參數以及調動到 global memory，同時也要求其必須有 void 以外的回傳值。

換言之，__attribute__((const)) 讓 C function (函式) 限縮到數學意義上的 function (函數)，這也是為何中文翻譯應該區分「函式」和「函數」

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

測驗 3

以下是 bitcpy 實作，允許開發者對指定的記憶體區域，逐 bit 進行資料複製:
(避免篇幅過長故隱藏)



































































#include <stdint.h>

void bitcpy(void *_dest,      /* Address of the buffer to write to */
            size_t _write,    /* Bit offset to start writing to */
            const void *_src, /* Address of the buffer to read from */
            size_t _read,     /* Bit offset to start reading from */
            size_t count)
{
    size_t read_lhs = _read & 7;
    size_t read_rhs = 8 - read_lhs;
    const uint8_t *source = (const uint8_t *) _src + (_read / 8);
    size_t write_lhs = _write & 7;
    size_t write_rhs = 8 - write_lhs;
    uint8_t *dest = (uint8_t *) _dest + (_write / 8);

    static const uint8_t read_mask[] = {
        0x00, /*    == 0    00000000b   */
        0x80, /*    == 1    10000000b   */
        0xC0, /*    == 2    11000000b   */
        0xE0, /*    == 3    11100000b   */
        0xF0, /*    == 4    11110000b   */
        0xF8, /*    == 5    11111000b   */
        0xFC, /*    == 6    11111100b   */
        0xFE, /*    == 7    11111110b   */
        0xFF  /*    == 8    11111111b   */
    };

    static const uint8_t write_mask[] = {
        0xFF, /*    == 0    11111111b   */
        0x7F, /*    == 1    01111111b   */
        0x3F, /*    == 2    00111111b   */
        0x1F, /*    == 3    00011111b   */
        0x0F, /*    == 4    00001111b   */
        0x07, /*    == 5    00000111b   */
        0x03, /*    == 6    00000011b   */
        0x01, /*    == 7    00000001b   */
        0x00  /*    == 8    00000000b   */
    };

    while (count > 0) {
        uint8_t data = *source++;
        size_t bitsize = (count > 8) ? 8 : count;
        if (read_lhs > 0) {
            data <<= read_lhs;
            if (bitsize > read_rhs)
                data |= (*source >> read_rhs);
        }

        if (bitsize < 8)
            data &= read_mask[bitsize];

        uint8_t original = *dest;
        uint8_t mask = read_mask[write_lhs];
        if (bitsize > write_rhs) {
            /* Cross multiple bytes */
            *dest++ = (original & mask) | (data >> write_lhs);
            original = *dest & write_mask[bitsize - write_rhs];
            *dest = original | (data << write_rhs);
        } else {
            // Since write_lhs + bitsize is never >= 8, no out-of-bound access.
            mask |= write_mask[write_lhs + bitsize];
            *dest++ = (original & mask) | (data >> write_lhs);
        }

        count -= bitsize;
    }
}

透過 read_lhs 與 read_rhs 可以知道資料所需的偏移量。而 write 也是使用了相同的手段。
而 _read / 8 可以得知要從第幾個 byte 開始讀取。






size_t read_lhs = _read & 7;
size_t read_rhs = 8 - read_lhs;
const uint8_t *source = (const uint8_t *) _src + (_read / 8);
size_t write_lhs = _write & 7;
size_t write_rhs = 8 - write_lhs;
uint8_t *dest = (uint8_t *) _dest + (_write / 8);

限制每次最多處理 8 bits。
並且藉由 bitsize 與 read_rhs 的比較去判斷此次運算是否需要跨越到下一個 byte。
如果需要的話代表 data 需要加入下一個 byte 內最高位開始的 read_rhs 個 bit。






size_t bitsize = (count > 8) ? 8 : count;
if (read_lhs > 0) {
    data <<= read_lhs;
    if (bitsize > read_rhs)
        data |= (*source >> read_rhs);
}

而 read_mask 與 write_mask 的用處就在於保證程式只更改 / 更新到所需要的 bits。

先解釋程式如何進行寫入:

如果需要寫入最低位的 3 個 bit，則會產生下列的 mask:

再來利用 *dest = (*dest & mask) | (data >> write_lhs);
(*dest & mask) 可以保留不須更改的高位 5 個 bits。
(data >> write_lhs) 則是用來取出需要寫入的 bits。
將兩者做 bitwise or 運算就可以完成預期中寫入的動作。

程式碼寫入實作說明
如果不需要跨位元組寫入，透過 mask |= write_mask[write_lhs + bitsize]; 會將需要寫入的 bits 設為 0，而其餘設為 1，就可以透過下一行的操作在不更動其餘資料的狀況下將資料填入。
如果需要跨位元組寫入，第 60 行的會先寫入第一個 byte，利用 mask 將其操作限制在最右邊幾個所需的 bits。而下面兩行則是利用 write_mask 將需要寫入的幾個高位元的 bit 設為 0，其餘不變的則為 0，藉此完成寫入的操作。















if (bitsize < 8)
    data &= read_mask[bitsize];

uint8_t original = *dest;
uint8_t mask = read_mask[write_lhs];
if (bitsize > write_rhs) {
    /* Cross multiple bytes */
    *dest++ = (original & mask) | (data >> write_lhs);
    original = *dest & write_mask[bitsize - write_rhs];
    *dest = original | (data << write_rhs);
} else {
    // Since write_lhs + bitsize is never >= 8, no out-of-bound access.
    mask |= write_mask[write_lhs + bitsize];
    *dest++ = (original & mask) | (data >> write_lhs);
}

TODO 研究 linux kernel bitmaps

測驗 4

參考 String interning
In computer science, string interning is a method of storing only one copy of each distinct string value, which must be immutable. Interning strings makes some string processing tasks more time- or space-efficient at the cost of requiring more time when the string is created or interned.
以及 Immutable object
In object-oriented and functional programming, an immutable object (unchangeable object) is an object whose state cannot be modified after it is created.

String interning 是將出現多處的字串，只保存在單一的儲存空間，存取時直接讀取 reference，藉此節省運算與儲存成本。

以下是 string interning 在 C 語言中的簡易實作:

(避免篇幅過長故隱藏)

cstr.h



























































#pragma once
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

enum {
    CSTR_PERMANENT = 1,
    CSTR_INTERNING = 2,
    CSTR_ONSTACK = 4,
};

#define CSTR_INTERNING_SIZE (32)
#define CSTR_STACK_SIZE (128)

typedef struct __cstr_data {
    char *cstr;
    uint32_t hash_size;
    uint16_t type;
    uint16_t ref;
} * cstring;

typedef struct __cstr_buffer {
    cstring str;
} cstr_buffer[1];

#define CSTR_S(s) ((s)->str)

#define CSTR_BUFFER(var)                                                      \
    char var##_cstring[CSTR_STACK_SIZE] = {0};                                \
    struct __cstr_data var##_cstr_data = {var##_cstring, 0, CSTR_ONSTACK, 0}; \
    cstr_buffer var;                                                          \
    var->str = &var##_cstr_data;

#define CSTR_LITERAL(var, cstr)                                               \
    static cstring var = NULL;                                                \
    if (!var) {                                                               \
        cstring tmp = cstr_clone("" cstr, (sizeof(cstr) / sizeof(char)) - 1); \
        if (tmp->type == 0) {                                                 \
            tmp->type = CSTR_PERMANENT;                                       \
            tmp->ref = 0;                                                     \
        }                                                                     \
        if (!__sync_bool_compare_and_swap(&var, NULL, tmp)) {                 \
            if (tmp->type == CSTR_PERMANENT)                                  \
                free(tmp);                                                    \
        }                                                                     \
    }

#define CSTR_CLOSE(var)               \
    do {                              \
        if (!(var)->str->type)        \
            cstr_release((var)->str); \
    } while (0)

/* Public API */
cstring cstr_grab(cstring s);
cstring cstr_clone(const char *cstr, size_t sz);
cstring cstr_cat(cstr_buffer sb, const char *str);
int cstr_equal(cstring a, cstring b);
void cstr_release(cstring s);

cstr.c
































































































































































































































































#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#include "cstr.h"

#define INTERNING_POOL_SIZE 1024

#define HASH_START_SIZE 16 /* must be power of 2 */

struct __cstr_node {
    char buffer[CSTR_INTERNING_SIZE];
    struct __cstr_data str;
    struct __cstr_node *next;
};

struct __cstr_pool {
    struct __cstr_node node[INTERNING_POOL_SIZE];
};

struct __cstr_interning {
    int lock;
    int index;
    unsigned size;
    unsigned total;
    struct __cstr_node **hash;
    struct __cstr_pool *pool;
};

static struct __cstr_interning __cstr_ctx;

/* FIXME: use C11 atomics */
#define CSTR_LOCK()                                               \
    ({                                                            \
        while (__sync_lock_test_and_set(&(__cstr_ctx.lock), 1)) { \
        }                                                         \
    })
#define CSTR_UNLOCK() ({ __sync_lock_release(&(__cstr_ctx.lock)); })

static void *xalloc(size_t n)
{
    void *m = malloc(n);
    if (!m)
        exit(-1);
    return m;
}

static inline void insert_node(struct __cstr_node **hash,
                               int sz,
                               struct __cstr_node *node)
{
    uint32_t h = node->str.hash_size;
    int index = h & (sz - 1);
    node->next = hash[index];
    hash[index] = node;
}

static void expand(struct __cstr_interning *si)
{
    unsigned new_size = si->size * 2;
    if (new_size < HASH_START_SIZE)
        new_size = HASH_START_SIZE;

    struct __cstr_node **new_hash =
        xalloc(sizeof(struct __cstr_node *) * new_size);
    memset(new_hash, 0, sizeof(struct __cstr_node *) * new_size);

    for (unsigned i = 0; i < si->size; ++i) {
        struct __cstr_node *node = si->hash[i];
        while (node) {
            struct __cstr_node *tmp = node->next;
            insert_node(new_hash, new_size, node);
            node = tmp;
        }
    }

    free(si->hash);
    si->hash = new_hash;
    si->size = new_size;
}

static cstring interning(struct __cstr_interning *si,
                         const char *cstr,
                         size_t sz,
                         uint32_t hash)
{
    if (!si->hash)
        return NULL;

    int index = (int) (hash & (si->size - 1));
    struct __cstr_node *n = si->hash[index];
    while (n) {
        if (n->str.hash_size == hash) {
            if (!strcmp(n->str.cstr, cstr))
                return &n->str;
        }
        n = n->next;
    }
    // 80% (4/5) threshold
    if (si->total * 5 >= si->size * 4)
        return NULL;
    if (!si->pool) {
        si->pool = xalloc(sizeof(struct __cstr_pool));
        si->index = 0;
    }
    n = &si->pool->node[si->index++];
    memcpy(n->buffer, cstr, sz);
    n->buffer[sz] = 0;

    cstring cs = &n->str;
    cs->cstr = n->buffer;
    cs->hash_size = hash;
    cs->type = CSTR_INTERNING;
    cs->ref = 0;

    n->next = si->hash[index];
    si->hash[index] = n;

    return cs;
}

static cstring cstr_interning(const char *cstr, size_t sz, uint32_t hash)
{
    cstring ret;
    CSTR_LOCK();
    ret = interning(&__cstr_ctx, cstr, sz, hash);
    if (!ret) {
        expand(&__cstr_ctx);
        ret = interning(&__cstr_ctx, cstr, sz, hash);
    }
    ++__cstr_ctx.total;
    CSTR_UNLOCK();
    return ret;
}

static inline uint32_t hash_blob(const char *buffer, size_t len)
{
    const uint8_t *ptr = (const uint8_t *) buffer;
    size_t h = len;
    size_t step = (len >> 5) + 1;
    for (size_t i = len; i >= step; i -= step)
        h = h ^ ((h << 5) + (h >> 2) + ptr[i - 1]);
    return h == 0 ? 1 : h;
}

cstring cstr_clone(const char *cstr, size_t sz)
{
    if (sz < CSTR_INTERNING_SIZE)
        return cstr_interning(cstr, sz, hash_blob(cstr, sz));
    cstring p = xalloc(sizeof(struct __cstr_data) + sz + 1);
    if (!p)
        return NULL;
    void *ptr = (void *) (p + 1);
    p->cstr = ptr;
    p->type = 0;
    p->ref = 1;
    memcpy(ptr, cstr, sz);
    ((char *) ptr)[sz] = 0;
    p->hash_size = 0;
    return p;
}

cstring cstr_grab(cstring s)
{
    if (s->type & (CSTR_PERMANENT | CSTR_INTERNING))
        return s;
    if (s->type == CSTR_ONSTACK)
        return cstr_clone(s->cstr, s->hash_size);
    if (s->ref == 0)
        s->type = CSTR_PERMANENT;
    else
        __sync_add_and_fetch(&s->ref, 1);
    return s;
}

void cstr_release(cstring s)
{
    if (s->type || !s->ref)
        return;
    if (__sync_sub_and_fetch(&s->ref, 1) == 0)
        free(s);
}

static size_t cstr_hash(cstring s)
{
    if (s->type == CSTR_ONSTACK)
        return hash_blob(s->cstr, s->hash_size);
    if (s->hash_size == 0)
        s->hash_size = hash_blob(s->cstr, strlen(s->cstr));
    return s->hash_size;
}

int cstr_equal(cstring a, cstring b)
{
    if (a == b)
        return 1;
    if ((a->type == CSTR_INTERNING) && (b->type == CSTR_INTERNING))
        return 0;
    if ((a->type == CSTR_ONSTACK) && (b->type == CSTR_ONSTACK)) {
        if (a->hash_size != b->hash_size)
            return 0;
        return memcmp(a->cstr, b->cstr, a->hash_size) == 0;
    }
    uint32_t hasha = cstr_hash(a);
    uint32_t hashb = cstr_hash(b);
    if (hasha != hashb)
        return 0;
    return !strcmp(a->cstr, b->cstr);
}

static cstring cstr_cat2(const char *a, const char *b)
{
    size_t sa = strlen(a), sb = strlen(b);
    if (sa + sb < CSTR_INTERNING_SIZE) {
        char tmp[CSTR_INTERNING_SIZE];
        memcpy(tmp, a, sa);
        memcpy(tmp + sa, b, sb);
        tmp[sa + sb] = 0;
        return cstr_interning(tmp, sa + sb, hash_blob(tmp, sa + sb));
    }
    cstring p = xalloc(sizeof(struct __cstr_data) + sa + sb + 1);
    if (!p)
        return NULL;

    char *ptr = (char *) (p + 1);
    p->cstr = ptr;
    p->type = 0;
    p->ref = 1;
    memcpy(ptr, a, sa);
    memcpy(ptr + sa, b, sb);
    ptr[sa + sb] = 0;
    p->hash_size = 0;
    return p;
}

cstring cstr_cat(cstr_buffer sb, const char *str)
{
    cstring s = sb->str;
    if (s->type == CSTR_ONSTACK) {
        int i = s->hash_size;
        while (i < CSTR_STACK_SIZE - 1) {
            s->cstr[i] = *str;
            if (*str == 0)
                return s;
            ++s->hash_size;
            ++str;
            ++i;
        }
        s->cstr[i] = 0;
    }
    cstring tmp = s;
    sb->str = cstr_cat2(tmp->cstr, str);
    cstr_release(tmp);
    return sb->str;
}

TODO