2021q1 Homework3 (quiz3)

contributed by < Julian-Chu >

tags: `linux2021`

GitHub

struct

xs union 定義 16 個位元組，應用與實作細節如下:

字串長度小於或等於 15 個位元組，則放在 stack。字串長度大於或等於 16 個位元組，則放在 heap (透過 malloc 系統呼叫配置所需字串大小)

放在 heap 的字串, 還有額外的判斷, 針對長度大於LARGE_STRING_LEN 的字串會進行複用, 同時將字串前 4 bytes 用於 reference count

需要解釋為何 reference counting 被納入考量

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

針對相同的長字串, 設計上利用參考指向同一塊記憶體空間, 避免複製浪費空間, 在沒有 reference count 的情況無法得知是否還有使用到這個字串的地方, 誤釋放記憶體空間會造成 dangling pointer 的風險。

union 與 bitfield的綜合使用, 對 c 語言的彈性跟記憶體空間利用效率真是大開眼界, 同一段記憶體空間可以用來表達不同的結構的同時又可以共用結構。

typedef union {    
    /* allow strings up to 15 bytes to stay on the stack    
     * use the last byte as a null terminator and to store flags    
     * much like fbstring:    
     * https://github.com/facebook/folly/blob/master/folly/docs/FBString.md    
     */    
    char data[16];    
    
    struct {    
        uint8_t filler[15],                                                                                                                                                            
            /* how many free bytes in this stack allocated string    
             * same idea as fbstring    
             */    
            space_left : 4,    
            /* if it is on heap, set to 1 */    
            is_ptr : 1, is_large_string : 1, flag2 : 1, flag3 : 1;    
    };    
    
    /* heap allocated */    
    struct {    
        char *ptr;    
        /* supports strings up to 2^MAX_STR_LEN_BITS - 1 bytes */    
        size_t size : MAX_STR_LEN_BITS,    
                      /* capacity is always a power of 2 (unsigned)-1 */    
                      capacity : 6;    
        /* the last 4 bits are important flags */    
    };    
} xs;

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

string in stack

    struct {    
        uint8_t filler[15],                                                                                                                                                            
            /* how many free bytes in this stack allocated string    
             * same idea as fbstring    
             */    
            space_left : 4,    
            /* if it is on heap, set to 1 */    
            is_ptr : 1, is_large_string : 1, flag2 : 1, flag3 : 1;    
    };

space_left 使用了 4 bits , 剛好完整表達了在 stack 的字串長度為 0 - 15 (0b0000 - 0b1111), 將剩下的 4 bits, 用於 flag

TODO: 用 GDB 確認 xs 在記憶體實際的佈局的確符合預期

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

使用 GDB 確認 xs 的記憶體佈局

閱讀 GDB x command

測試用的程式碼, break point 設在 return 0; 行, 利用 GDB x 指令查看記憶體內容。
















int main(int argc, char *argv[])                                                                                                                                                       
{    
    xs string = *xs_tmp("foobarbar");    
    (&string)->is_ptr = 1; 
     
    // size = 19
    xs mstr = *xs_tmp("xxxxxxxxxxxxxxxxxxx");    
    
    // size = 256
    xs lstr = *xs_tmp("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    // ref + 1
    xs_inc_refcnt(&lstr);    
       
    return 0;    
}

查看 string in stack













// 查看 string 的記憶體位置,
// 發現最後一個 byte 與前部分記憶體配置圖不合 
// space_left is_ptr is_large_string flag2 flag3
//  0110       1             0        0      0
// 0b01101000 = 104 實際上卻是 22
(gdb) x/16c &string
0x7fffffffde50:	102 'f'	111 'o'	111 'o'	98 'b'	97 'a'	114 'r'	98 'b'	97 'a'
0x7fffffffde58:	114 'r'	0 '\000'	0 '\000'	0 '\000'	0 '\000'	0 '\000'	0 '\000'	22 '\026'

// 直接查看最後一個 byte , 發現 bits 配置應爲 flag3 flag2 is_large_string is_ptr space_left (0, 0, 0, 1, 0 1 1 0)
(gdb) x/t 0x7fffffffde5f
0x7fffffffde5f:	00010110

查看 medium string






















// 查看 structure
(gdb) p mstr
$2 = {data = "\240\242UUUU\000\000\023\000\000\000\000\000@\021", {filler = "\240\242UUUU\000\000\023\000\000\000\000\000@", space_left = 1 '\001', is_ptr = 1 '\001', 
    is_large_string = 0 '\000', flag2 = 0 '\000', flag3 = 0 '\000'}, {ptr = 0x55555555a2a0 'x' <repeats 19 times>, size = 19, capacity = 5}}

// 查看指針指向的資料
(gdb) x/32c mstr->ptr
0x55555555a2a0:	120 'x'	120 'x'	120 'x'	120 'x'	120 'x'	120 'x'	120 'x'	120 'x'
0x55555555a2a8:	120 'x'	120 'x'	120 'x'	120 'x'	120 'x'	120 'x'	120 'x'	120 'x'
0x55555555a2b0:	120 'x'	120 'x'	120 'x'	0 '\000'	0 '\000'	0 '\000'	0 '\000'	0 '\000'
0x55555555a2b8:	0 '\000'	0 '\000'	0 '\000'	0 '\000'	0 '\000'	0 '\000'	0 '\000'	0 '\000'

// 確認 size 爲 19
(gdb) x/16d &mstr
0x7fffffffde70:	-96	-94	85	85	85	85	0	0
0x7fffffffde78:	19	0	0	0	0	0	64	17

// capacity 被拆開在兩個 byte 裡面, 需要取 0100000 前兩位 01
// 00010001 後四位 0001 組合 000101 可得 capacity = 5(2^5 = 32)
(gdb) x/2t 0x7fffffffde7E
0x7fffffffde7e:	01000000	00010001

查看 large string








































(gdb) p lstr
$3 = {data = "ТUUUU\000\000\000\001\000\000\000\000@2", {filler = "ТUUUU\000\000\000\001\000\000\000\000@", space_left = 2 '\002', is_ptr = 1 '\001', is_large_string = 1 '\001', 
    flag2 = 0 '\000', flag3 = 0 '\000'}, {ptr = 0x55555555a2d0 "\002", size = 256, capacity = 9}}

// 查看 ptr 可以確認, 前四個 byte 用於計算 refernence count
(gdb) x/264c lstr->ptr
0x55555555a2d0:	2 '\002'	0 '\000'	0 '\000'	0 '\000'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a2d8:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a2e0:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a2e8:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a2f0:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a2f8:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a300:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a308:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a310:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a318:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a320:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a328:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a330:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a338:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a340:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a348:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a350:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a358:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a360:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a368:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a370:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a378:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a380:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a388:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a390:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a398:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a3a0:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a3a8:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a3b0:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a3b8:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a3c0:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a3c8:	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'	97 'a'
0x55555555a3d0:	97 'a'	97 'a'	97 'a'	97 'a'	0 '\000'	0 '\000'	0 '\000'	0 '\000'

前述的記憶體配置示意圖在 bitfield 的部分皆錯誤, 更正後如下圖。

string in heap

    /* heap allocated */    
    struct {    
        char *ptr;    
        /* supports strings up to 2^MAX_STR_LEN_BITS - 1 bytes */    
        size_t size : MAX_STR_LEN_BITS,    
                      /* capacity is always a power of 2 (unsigned)-1 */    
                      capacity : 6;    
        /* the last 4 bits are important flags */    
    };

以 8 bytes 的 pointer to char 指向字串的記憶體位置, 用 size 來表示字串長度, capacity 表示記憶體空間的 power of two, 對於長度大於LARGE_STRING_LEN 的字串, 會將 ptr 指向的位置, 前 4 bytes(int) 用於紀錄reference count, 後 4 bits 沒有被指定到, 會復用上一個 struct 定義的 flags

initialization

利用目標字串的長度決定 xs 字串的儲存位置在 stack 或 heap , 而針對 large string 與否的處理封裝在 xs_allocate_data 之中

xs *xs_new(xs *x, const void *p)
{
    *x = xs_literal_empty();
    size_t len = strlen(p) + 1;
    if (len > NNN) {      // NNN = 16, 需注意是 字串長度加上 null terminator
        x->capacity = ilog2(len) + 1;
        x->size = len - 1;
        x->is_ptr = true;
        xs_allocate_data(x, x->size, 0);
        memcpy(xs_data(x), p, len);
    } else {
        memcpy(x->data, p, len);
        x->space_left = 15 - (len - 1);
    }
    return x;
}

xs_allocate_data 初始化在 heap 中的字串, 根據字串的長度區分 medium string 跟 large string 後在依據 reallocate flag 分別做基於改變現有資料的記憶體大小 realloc 或是配置新記憶體空間malloc

static void xs_allocate_data(xs *x, size_t len, bool reallocate)
{
    /* Medium string */
    if (len < LARGE_STRING_LEN) {
        x->ptr = reallocate ? realloc(x->ptr, (size_t) 1 << x->capacity)
                            : malloc((size_t) 1 << x->capacity);
        return;
    }

    /* Large string */
    x->is_large_string = 1;

    /* The extra 4 bytes are used to store the reference count */
    x->ptr = reallocate ? realloc(x->ptr, (size_t)(1 << x->capacity) + 4)
                        : malloc((size_t)(1 << x->capacity) + 4);

    xs_set_refcnt(x, 1);
}

利用 macro 在初始化之前做檢查

/* Memory leaks happen if the string is too long but it is still useful for
 * short strings.
 */
#define xs_tmp(x)                                                   \
    ((void) ((struct {                                              \
         _Static_assert(sizeof(x) <= MAX_STR_LEN, "it is too big"); \
         int dummy;                                                 \
     }){1}),                                                        \
     xs_new(&xs_literal_empty(), x))

string concatenation and trim

/* grow up to specified size */
xs *xs_grow(xs *x, size_t len)
{
    char buf[16];

    // 已配置的記憶體空間大於新的 len 不需處理
    if (len <= xs_capacity(x))
        return x;

    // 當 data 存在 stack 的時候, 先將 data copy 到 buf
    /* Backup first */
    if (!xs_is_ptr(x))
        memcpy(buf, x->data, 16);

        
    // 由於 xs 初始化爲 stack 時會直接給到最大的 stack capacity, 再進行 grow 的話會轉為使用 heap
    x->is_ptr = true;
    // 直接將容量擴充到可以最小容納新長度字串的 2 的冪記憶體空間
    x->capacity = ilog2(len) + 1;

    
    if (xs_is_ptr(x)) {
        // 已經配置過 heap 記憶體, 改變大小
        xs_allocate_data(x, len, 1);
    } else {
        // 從 stack 轉換成 heap , 配置新的記憶體空間
        xs_allocate_data(x, len, 0);
        memcpy(xs_data(x), buf, 16);
    }
    return x;
}


static inline xs *xs_newempty(xs *x)
{
    *x = xs_literal_empty();
    return x;
}

static inline xs *xs_free(xs *x)
{
    // 使用 heap 的情況下, 需要確定沒有 reference 後才可以釋放記憶體空間
    if (xs_is_ptr(x) && xs_dec_refcnt(x) <= 0)
        free(x->ptr);
    return xs_newempty(x);
}

static bool xs_cow_lazy_copy(xs *x, char **data)
{
    // 並非 large string 或沒有其他參考, 不用作 lazy copy
    if (xs_get_refcnt(x) <= 1)
        return false;

    // 將 large string 的參考減一後, 配置新的記憶體空間給x->ptr
    /* Lazy copy */
    xs_dec_refcnt(x);
    xs_allocate_data(x, x->size, 0);
    
    // 將 data 指向的記憶體空間資料複製一份至新的x->ptr
    // 將 data 指向新的副本
    if (data) {
        memcpy(xs_data(x), *data, x->size);

        /* Update the newly allocated pointer */
        *data = xs_data(x);
    }
    return true;
}

xs *xs_concat(xs *string, const xs *prefix, const xs *suffix)
{
    size_t pres = xs_size(prefix), sufs = xs_size(suffix),
           size = xs_size(string), capacity = xs_capacity(string);

    char *pre = xs_data(prefix), *suf = xs_data(suffix),
         *data = xs_data(string);

    xs_cow_lazy_copy(string, &data);

    // 現有配置的記憶體空間足夠容納新的字串
    if (size + pres + sufs <= capacity) {
        // 將原本 data 的記憶體位置加上 pres 大小 offset 後, 將data 的資料搬移過去, 相當於在 data 平移 pres 大小的 offset
        memmove(data + pres, data, size);
        // 將 pre 複製至上一步平移後騰出來的頭部空間
        memcpy(data, pre, pres);
        // 將 suf 放到平移後的 data 尾部
        memcpy(data + pres + size, suf, sufs + 1);

        // 依據 stack 或 heap 更新 size
        if (xs_is_ptr(string))
            string->size = size + pres + sufs;
        else
            string->space_left = 15 - (size + pres + sufs);
    } else {
        // 空間不足, 產生新的 xs
        xs tmps = xs_literal_empty();
        xs_grow(&tmps, size + pres + sufs);
        // 取得新的 xs 的記憶體空間位置
        char *tmpdata = xs_data(&tmps);
        // 依序複製資料到對應的記憶體位置
        memcpy(tmpdata + pres, data, size);
        memcpy(tmpdata, pre, pres);
        memcpy(tmpdata + pres + size, suf, sufs + 1);
        // 將原本的string 佔用的空間釋放
        xs_free(string);
        // 將string指向新的 xs, 並設定新的 size
        *string = tmps;
        string->size = size + pres + sufs;
    }
    return string;
}

xs *xs_trim(xs *x, const char *trimset)
{
    // 需要修剪的字串爲空
    if (!trimset[0])
        return x;
    
    char *dataptr = xs_data(x), *orig = dataptr;
    // 如果有需要產生副本則將 orig 指向新的副本
    if (xs_cow_lazy_copy(x, &dataptr))
        orig = dataptr;

    /* similar to strspn/strpbrk but it operates on binary data */
    uint8_t mask[32] = {0};

// 第一次看到以爲是類似 bloom filter 的方式, 看了課堂問答才知道是很巧妙的查表 ref: https://hackmd.io/@sysprog/rk9fG7zG_?fbclid=IwAR3rRvcKQtFMOmUQ8CRkUCKABhlk-gZRKC6M7S4BDh-XKMOa-uWCIcFix-s
#define check_bit(byte) (mask[(uint8_t)byte/8]&1 << (uint8_t)byte %8)
#define set_bit(byte) (mask[(uint8_t)byte/8]|=1<<(uint8_t)byte %8)
    size_t i, slen = xs_size(x), trimlen = strlen(trimset);

    for (i = 0; i < trimlen; i++)
        set_bit(trimset[i]);
    // 利用查表找出 string 在移除字母表裡的第一個字元跟最後一個字元
    for (i = 0; i < slen; i++)
        if (!check_bit(dataptr[i]))
            break;
    for (; slen > 0; slen--)
        if (!check_bit(dataptr[slen - 1]))
            break;
    // 移動到不被刪除的第一個字元, 跟計算新字串的長度
    dataptr += i;
    slen -= i;

    /* reserved space as a buffer on the heap.
     * Do not reallocate immediately. Instead, reuse it as possible.
     * Do not shrink to in place if < 16 bytes.
     */
    memmove(orig, dataptr, slen);
    /* do not dirty memory unless it is needed */
    if (orig[slen])
        orig[slen] = 0;

    if (xs_is_ptr(x))
        x->size = slen;
    else
        x->space_left = 15 - slen;
    return x;
#undef check_bit
#undef set_bit
}

similar concept in Redis object

回頭去看之前嘗試閱讀的 redis 的程式碼, 做過 quiz3 後對這個 struct 設計有更進一步的了解

typedef struct redisObject {
    unsigned type:4;
    unsigned encoding:4;
    unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or
                            * LFU data (least significant 8 bits frequency
                            * and most significant 16 bits access time). */
    int refcount;
    void *ptr;
} robj;

Jim Huang

2021/03/21 21:02:18

```c /* grow up to specified size */ xs *xs_grow(xs *x, size_t len) { char buf[16]; // 已配置的記憶體空間大於新的 len 不需處理 if (len <= xs_capacity(x)) return x; // 當 data 存在 stack 的時候, 先將 data copy 到 buf /* Backup first */ if (!xs_is_ptr(x)) memcpy(buf, x->data, 16); // 由於 xs 初始化爲 stack 時會直接給到最大的 stack capacity, 再進行 grow 的話會轉為使用 heap x->is_ptr = true; // 直接將容量擴充到可以最小容納新長度字串的 2 的冪記憶體空間 x->capacity = ilog2(len) + 1; if (xs_is_ptr(x)) { // 已經配置過 heap 記憶體, 改變大小 xs_allocate_data(x, len, 1); } else { // 從 stack 轉換成 heap , 配置新的記憶體空間 xs_allocate_data(x, len, 0); memcpy(xs_data(x), buf, 16); } return x; } static inline xs *xs_newempty(xs *x) { *x = xs_literal_empty(); return x; } static inline xs *xs_free(xs *x) { // 使用 heap 的情況下, 需要確定沒有 reference 後才可以釋放記憶體空間 if (xs_is_ptr(x) && xs_dec_refcnt(x) <= 0) free(x->ptr); return xs_newempty(x); } static bool xs_cow_lazy_copy(xs *x, char **data) {

由於 n 尚未在程式碼出現，可將「2 的 n 次方」改為「2 的冪」，以降低誤解 (Edited)

2021q1 Homework3 (quiz3)

tags: linux2021

struct

string in stack

使用 GDB 確認 xs 的記憶體佈局

string in heap

initialization

string concatenation and trim

similar concept in Redis object

Read more

2023 Homework3

linux2023: Julian-Chu

2022q1 Homework6 (ktcp)

2022q1 Homework1 (lab0)

tags: `linux2021`