# facebooc-q1 ###### tags: `facebooc 2022` **Tribute to:** < `jserv` > ## 作業說明 ### `list.h` 改寫自:[`Linux/list.h`](https://github.com/torvalds/linux/blob/master/include/linux/list.h) 的實作。利用 `struct list_head` 資料結構,建構一個巧妙的 circular doubly linked-list, ```c struct list_head { struct list_head *prev, *next; }; ``` 程式運作原理說明:[Linked list 在 Linux 核心原始程式碼](https://hackmd.io/@sysprog/c-linked-list#Linked-list-%E5%9C%A8-Linux-%E6%A0%B8%E5%BF%83%E5%8E%9F%E5%A7%8B%E7%A8%8B%E5%BC%8F%E7%A2%BC) **我們提供以下程式碼進行改寫:** ```c #include <stddef.h> /** * container_of() - Calculate address of object that contains address ptr * @ptr: pointer to member variable * @type: type of the structure containing ptr * @member: name of the member variable in struct @type * * Return: @type pointer of object containing ptr */ #ifndef container_of #define container_of(ptr, type, member) \ __extension__({ \ const __typeof__(((type *) 0)->member) *__pmember = (ptr); \ (type *) ((char *) __pmember - offsetof(type, member)); \ }) #endif /** * struct list_head - Head and node of a doubly-linked list * @prev: pointer to the previous node in the list * @next: pointer to the next node in the list */ struct list_head { struct list_head *prev, *next; }; /** * LIST_HEAD - Declare list head and initialize it * @head: name of the new object */ #define LIST_HEAD(head) struct list_head head = {&(head), &(head)} /** * INIT_LIST_HEAD() - Initialize empty list head * @head: pointer to list head */ static inline void INIT_LIST_HEAD(struct list_head *head) { head->next = head; head->prev = head; } /** * list_add_tail() - Add a list node to the end of the list * @node: pointer to the new node * @head: pointer to the head of the list */ static inline void list_add_tail(struct list_head *node, struct list_head *head) { struct list_head *prev = head->prev; prev->next = node; node->next = head; node->prev = prev; head->prev = node; } /** * list_remove() - Remove a list node from the list * @node: pointer to the node */ static inline void list_remove(struct list_head *node) { struct list_head *next = node->next, *prev = node->prev; next->prev = prev; prev->next = next; } /** * list_empty() - Check if list head has no nodes attached * @head: pointer to the head of the list */ static inline int list_empty(const struct list_head *head) { return (head->next == head); } /** * list_is_singular() - Check if list head has exactly one node attached * @head: pointer to the head of the list */ static inline int list_is_singular(const struct list_head *head) { return (!list_empty(head) && head->prev == head->next); } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } /** * list_swap_ptr - swap two pointer's value * @ptr1: the first pointer * @ptr2: the second pointer */ static inline void list_swap_ptr(char **ptr1, char **ptr2) { char *tmp = *ptr1; *ptr1 = *ptr2; *ptr2 = tmp; } /** * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position * @entry1: the location to place entry2 * @entry2: the location to place entry1 */ static inline void list_swap(struct list_head *entry1, struct list_head *entry2) { struct list_head *pos = entry2->prev; list_remove(entry2); list_replace(entry1, entry2); if (pos == entry1) pos = entry2; list_add_tail(entry1, pos); } /** * list_splice_tail() - Add list nodes from a list to end of another list * @list: pointer to the head of the list with the node entries * @head: pointer to the head of the list */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { struct list_head *head_last = head->prev; struct list_head *list_first = list->next, *list_last = list->prev; if (list_empty(list)) return; head->prev = list_last; list_last->next = head; list_first->prev = head_last; head_last->next = list_first; } /** * list_cut_position() - Move beginning of a list to another list * @head_to: pointer to the head of the list which receives nodes * @head_from: pointer to the head of the list * @node: pointer to the node in which defines the cutting point */ static inline void list_cut_position(struct list_head *head_to, struct list_head *head_from, struct list_head *node) { struct list_head *head_from_first = head_from->next; if (list_empty(head_from)) return; if (list_is_singular(head_from) && (head_from->next != node && head_from != node)) return; if (head_from == node) { INIT_LIST_HEAD(head_to); return; } head_from->next = node->next; head_from->next->prev = head_from; head_to->prev = node; node->next = head_to; head_to->next = head_from_first; head_to->next->prev = head_to; } /** * list_entry() - Calculate address of entry that contains list node * @node: pointer to list node * @type: type of the entry containing the list node * @member: name of the list_head member variable in struct @type */ #define list_entry(node, type, member) container_of(node, type, member) /** * list_first_entry() - get first entry of the list * @head: pointer to the head of the list * @type: type of the entry containing the list node * @member: name of the list_head member variable in struct @type */ #define list_first_entry(head, type, member) \ list_entry((head)->next, type, member) /** * list_for_each - iterate over list nodes * @node: list_head pointer used as iterator * @head: pointer to the head of the list */ #define list_for_each(node, head) \ for (node = (head)->next; node != (head); node = node->next) /** * list_for_each_prev_reverse - iterate over a list backwards for reverse the list * @node: list_head pointer used as iterator * @head: pointer to the head of the list */ #define list_for_each_next_reverse(node, head) \ for (node = (head->next); node != (head); node = node->prev) /** * shamelessly copy from linux/list.h * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) ``` --- ### folly::FBString [folly::FBString](https://github.com/facebook/folly/blob/main/folly/FBString.h) 實作主要手法透過下方的 union: ```c union { struct { char *data; size_t size; size_t capacity; } heap; struct { char data[23]; int8_t length; // align 1 } stack; }; ``` 分為「短字串」以及「中長字串」,分開進行處理。「短字串」保存在 `char data[23]`,而「中長字串」則透過 data 這個指標和 `malloc` 的使用,保存在 heap 上。 ### `xs.c` 在伺服器運算 ([server-side](https://en.wikipedia.org/wiki/Server-side) computing) 領域中,常會遇到頻繁的短字串處理,為此 Facebook 特別發展 [folly::fbstring](https://github.com/facebook/folly/blob/main/folly/FBString.h) 作為 C++ [std::string](https://legacy.cplusplus.com/reference/string/string/) 的高效能替代品,而之所以可以有效能上的突破,在於透過緊湊的記憶體佈局,施行 SSO (small string optimization) 和 CoW (copy on write) 這兩種最佳化手法: * 當字串被歸類為「短字串」,會使用 stack 上的空間來保存字串。 * 當字串被視作「中等長度的字串」,採用動態配置記憶體; * 當字串倍視作「長字串」,會透過 CoW 手段進行最佳化:進行字串複製操作時,倘若字串本身沒有修改,就會共享記憶體空間,從而減少記憶體佔用和省去複製的開銷。換言之,真正的「複製」操作僅發生在字串內容發生變更。 在 stack 上的空間 (也就是不計算透過 malloc 所取得的 heap 空間) 佔用尤其精巧,[folly::FBString](https://github.com/facebook/folly/blob/main/folly/FBString.h) 本體使用 24 個位元組,但卻能表達 23 個位元組長度的字串,沒有任何ㄧ個位元組浪費。相較之下,[std::string](https://legacy.cplusplus.com/reference/string/string/) 本身佔用 32 個位元組,但在 heap 上卻只能表達 16 個位元組的字串。 我們可以效仿 [folly::FBString](https://github.com/facebook/folly/blob/main/folly/FBString.h) 的資料結構實作一個山寨版: ```c #define MAX_STR_LEN_BITS (54) #define MAX_STR_LEN ((1UL << MAX_STR_LEN_BITS) - 1) #define LARGE_STRING_LEN 256 #define MIDDLE_STRING_LEN 16 typedef union { /* allow strings up to 15 bytes to stay on the stack * use the last byte as a null terminator and to store flags * much like fbstring: * https://github.com/facebook/folly/blob/master/folly/docs/FBString.md */ char data[MIDDLE_STRING_LEN]; struct { uint8_t filler[15], /* how many free bytes in this stack allocated string * same idea as fbstring */ space_left : 4, /* if it is on heap, set to 1 */ is_ptr : 1, is_large_string : 1, flag2 : 1, flag3 : 1; }; /* heap allocated */ struct { char *ptr; /* supports strings up to 2^54 - 1 bytes */ size_t size : 54, /* capacity is always a power of 2 (unsigned)-1 */ capacity : 6; /* the last 4 bits are important flags */ }; } xs; ``` `xs` `union` 定義 16 個位元組,應用與實作細節如下: * 字串長度小於或等於 15 個位元組,則放在 stack。 * 字串長度大於或等於 16 個位元組,則放在 heap (透過 `malloc` 系統呼叫配置所需字串大小) * heap struct * `ptr`: 8 個位元組指標 (64 位元架構: x86_64/aarch64 等等)。 * `size`: 字串長度。因定義 54 位元,故最大字串長度為 $2^{54}$ − 1 位元組。 * `capacity`: 從 heap 配置的空間大小,其單位為 $2^{capacity}$,故用 6 個位元即可涵蓋 `size` 長度。 * `is_ptr`: 若有從 heap 配置空間,`is_ptr` 值為 1,否則為 0。 * `is_large_string`: 若字串長度 > `LARGE_STRING_LEN`,視為「長字串」,`is_large_string` 值為 1,否則為 0。 * 有 2 個位元沒有定義,是為了避免覆寫另一結構成員: `flag 2` 與 `flag3`。 **我們提供以下程式碼進行改寫:** ```c #include <stdbool.h> #include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #include <unistd.h> #define MAX_STR_LEN_BITS (54) #define MAX_STR_LEN ((1UL << MAX_STR_LEN_BITS) - 1) #define LARGE_STRING_LEN 256 #define MIDDLE_STRING_LEN 16 /* Memory leaks happen if the string is too long but it is still useful for * short strings. */ #define xs_tmp(x) \ ((void) ((struct { \ _Static_assert(sizeof(x) <= MAX_STR_LEN, "it is too big"); \ int dummy; \ }){1}), \ xs_new(&xs_literal_empty(), x)) #define xs_literal_empty() \ (xs) { .space_left = 15 } enum xs_type { XS_SHORT = 0, XS_MEDIUM, XS_LARGE, }; typedef union { /* allow strings up to 15 bytes to stay on the stack * use the last byte as a null terminator and to store flags * much like fbstring: * https://github.com/facebook/folly/blob/master/folly/docs/FBString.md */ char data[MIDDLE_STRING_LEN]; struct { uint8_t filler[15], /* how many free bytes in this stack allocated string * same idea as fbstring */ space_left : 4, /* if it is on heap, set to 1 */ is_ptr : 1, is_large_string : 1, flag2 : 1, flag3 : 1; }; /* heap allocated */ struct { char *ptr; /* supports strings up to 2^54 - 1 bytes */ size_t size : 54, /* capacity is always a power of 2 (unsigned)-1 */ capacity : 6; /* the last 4 bits are important flags */ }; } xs; static inline bool xs_is_ptr(const xs *x) { return x->is_ptr; } static inline bool xs_is_large_string(const xs *x) { return x->is_large_string; } size_t xs_size(const xs *x) { return xs_is_ptr(x) ? x->size : 15 - x->space_left; } char *xs_data(const xs *x) { if (!xs_is_ptr(x)) return (char *) x->filler; if (xs_is_large_string(x)) return (char *) (x->ptr + /* HEADER */ 4); return (char *) x->ptr; } int xs_type(const xs *x) { if (!x->is_ptr) return XS_SHORT; if (x->is_large_string) return XS_LARGE; return XS_MEDIUM; } size_t xs_capacity(const xs *x) { return xs_is_ptr(x) ? ((size_t) 1 << x->capacity) - 1 : 15; } static inline void xs_set_refcnt(const xs *x, int val) { *((int *) ((size_t) x->ptr)) = val; } #define xs_literal_empty() \ (xs) { .space_left = 15 } /* lowerbound (floor log2) */ static inline int ilog2(uint32_t n) { return 32 - __builtin_clz(n) - 1; } static void xs_allocate_data(xs *x, size_t len, bool reallocate) { /** * This function can be improved by: * Delete the passing the arg @len. Therefore, change the if stmt, * using just x->capacity to determine whether @x is small, middle, or large string. */ /* Small string */ if (len < MIDDLE_STRING_LEN) { return; } /* Medium string */ if (len < LARGE_STRING_LEN) { x->ptr = reallocate ? realloc(x->ptr, (size_t) 1 << x->capacity) : malloc((size_t) 1 << x->capacity); x->is_large_string = 0; return; } /* Large string */ x->is_large_string = 1; /* The extra 4 bytes are used to store the reference count */ x->ptr = reallocate ? realloc(x->ptr, (size_t)(1 << x->capacity) + 4) : malloc((size_t)(1 << x->capacity) + 4); xs_set_refcnt(x, 1); } static inline void xs_inc_refcnt(const xs *x) { if (xs_is_large_string(x)) ++(*(int *) ((size_t) x->ptr)); } static inline int xs_dec_refcnt(const xs *x) { if (!xs_is_large_string(x)) return 0; return --(*(int *) ((size_t) x->ptr)); } static inline int xs_get_refcnt(const xs *x) { if (!xs_is_large_string(x)) return 0; return *(int *) ((size_t) x->ptr); } /** * xs_new create a xs string, allocate the necessary memory and copy the * the bytes from p to xs_data(x). If strlen(p) is greater or equal to * LARGE_STRING_LEN, use string interning to share memory address if possible. */ xs *xs_new(xs *x, const char *p) { *x = xs_literal_empty(); size_t len = strlen(p) + 1; if (len > LARGE_STRING_LEN) { x->capacity = ilog2(len) + 1; x->size = len - 1; x->is_ptr = true; x->is_large_string = true; /* TODO: String interning method */ } else if (len > MIDDLE_STRING_LEN) { x->capacity = ilog2(len) + 1; x->size = len - 1; x->is_ptr = true; xs_allocate_data(x, x->size, 0); memcpy(xs_data(x), p, len); } else { memcpy(x->data, p, len); x->space_left = 15 - (len - 1); } return x; } /* grow up to specified size */ xs *xs_grow(xs *x, size_t len) { if (len <= xs_capacity(x)) return x; x->capacity = ilog2(len) + 1; if (xs_is_ptr(x)) { xs_allocate_data(x, len, 1); } else { x->is_ptr = true; xs_allocate_data(x, len, 0); } /* TODO: Large string case */ return x; } static inline xs *xs_newempty(xs *x) { *x = xs_literal_empty(); return x; } static inline xs *xs_free(xs *x) { if (xs_is_ptr(x) && xs_dec_refcnt(x) <= 0) free(x->ptr); return xs_newempty(x); } static bool xs_cow_lazy_copy(xs *x, char **data) { if (!x->is_large_string) { memcpy(xs_data(x), *data, strlen(*data)); return true; } else if (xs_get_refcnt(x) > 1) { /* Lazy copy */ xs_dec_refcnt(x); /* TODO: String interning method */ } if (data) { memcpy(xs_data(x), *data, x->size); } /* Update the newly allocated pointer */ *data = xs_data(x); return true; } xs *xs_concat(xs *string, const xs *prefix, const xs *suffix) { size_t pres = xs_size(prefix), sufs = xs_size(suffix), size = xs_size(string), capacity = xs_capacity(string); char *pre = xs_data(prefix), *suf = xs_data(suffix), *data = xs_data(string); if (xs_get_refcnt(string) > 1) xs_cow_lazy_copy(string, &data); if (size + pres + sufs <= capacity) { memmove(data + pres, data, size); memcpy(data, pre, pres); memcpy(data + pres + size, suf, sufs + 1); if (xs_is_ptr(string)) string->size = size + pres + sufs; else string->space_left = 15 - (size + pres + sufs); } else { xs tmps = xs_literal_empty(); xs_grow(&tmps, size + pres + sufs); char *tmpdata = xs_data(&tmps); memcpy(tmpdata + pres, data, size); memcpy(tmpdata, pre, pres); memcpy(tmpdata + pres + size, suf, sufs + 1); xs_free(string); *string = tmps; string->size = size + pres + sufs; } /* TODO: String interning if is large string */ return string; } xs *xs_trim(xs *x, const char *trimset) { if (!trimset[0]) return x; char *dataptr = xs_data(x), *orig = dataptr; if (xs_get_refcnt(x) > 1) { xs_cow_lazy_copy(x, &dataptr); orig = dataptr; } /* Similar to strspn/strpbrk but it operates on binary data */ uint8_t mask[32] = {0}; #define check_bit(byte) (mask[(uint8_t) byte >> 3] & 1 << (uint8_t) byte & 7) #define set_bit(byte) (mask[(uint8_t) byte >> 3] |= 1 << (uint8_t) byte & 7) size_t i, slen = xs_size(x), trimlen = strlen(trimset); for (i = 0; i < trimlen; i++) set_bit(trimset[i]); for (i = 0; i < slen; i++) if (!check_bit(dataptr[i])) break; for (; slen > 0; slen--) if (!check_bit(dataptr[slen - 1])) break; dataptr += i; slen -= i; /* reserved space as a buffer on the heap. * Do not reallocate immediately. Instead, reuse it as possible. * Do not shrink to in place if < 16 bytes. */ memmove(orig, dataptr, slen); /* do not dirty memory unless it is needed */ if (orig[slen]) orig[slen] = 0; if (xs_is_ptr(x)) x->size = slen; else x->space_left = 15 - slen; /* TODO: String interning if is large string */ return x; #undef check_bit #undef set_bit } void xs_erase(xs *x) { if (xs_is_ptr(x)) { free(x->ptr); } memset(x, 0, sizeof(xs)); return; } ``` :::success **作業要求:** 1. 解釋 `list.h`、`xs.c` 的程式碼以及其運作原理。 2. 利用 `xs.c` 的實作,改寫第一次作業,並且設計實驗,與第一次的作業進行比較,cache 的使用以及效能上的差異。最後解釋你的實驗結果以及猜想。 3. 練習利用 `list.h` 的實作,針對 Linked list 的資料結構,改寫第一次的作業。 4. 研究 [字串駐留 (String Interning)](https://en.wikipedia.org/wiki/String_interning) 的手法,完善 `xs.c` 內部對於「長字串」處理的實作,並且提出對於 `xs.c` 程式碼更好的實作以及改進。 這裡提供 [字串駐留 (String Interning) 的相關實作](https://hackmd.io/@Zxiro/rk4qw64s_) 作為參考。 > Tribute to Zxiro ::: :::info 貼心提醒:作業的繳交期限沒有限制。 ::: --- ## 作業區 (HackMD / Github) :::info :exclamation: 無論標題和內文,中文和英文字元之間要有空白字元 (對排版和文字搜尋有利);文字訊息請避免用圖片來表示,否則不好搜尋和分類。 :notes: 「開發紀錄」的 HackMD 網址 **不一定** 要用「固定網址」,也就是如 https://hackmd.io/@itsme/XXXX 的形式,設定公開發表並允許已登入者進行編輯,請留意! :coffee: 我們在開發的過程中,所有人都能看得到其他人寫的共筆,大家可以互相參考、留下評論,當然也可以拿來引用,但是,引用時,請註明你們引用的出處。 ::: - [ ] XXXX <--- Discord 暱稱 * [開發紀錄](https://) / [Github](https://) - [ ] Astus * [開發紀錄](https://hackmd.io/xCOla4f6R0yDXdjS7AoCog?view) / [Github](https://) - [ ] peter lee * [開發紀錄](/lLLczFH_StOv1aEqGJ_Uwg) / [Github](https://)