# 2021q1 Homework2 (quiz2)
contributed by < [`andy78644`](https://github.com/andy78644) >
###### tags: `2021 linux`
## Q1
### 程式運作原理
```clike=
void list_merge_sort(queue_t *q)
{
if (list_is_singular(&q->list))
return;
queue_t left;
struct list_head sorted;
INIT_LIST_HEAD(&left.list);
list_cut_position(&left.list, &q->list, MMM);
list_merge_sort(&left);
list_merge_sort(q);
list_merge(&left.list, &q->list, &sorted);
INIT_LIST_HEAD(&q->list);
list_splice_tail(&sorted, &q->list);
}
```
```clike=
static inline void INIT_LIST_HEAD(struct list_head *head)
{
head->next = head; head->prev = head;
}
```
```graphviz
digraph structs {
node[shape=record]
struct1 [label="<prev> prev|<next> next"];
struct1:prev -> struct1[arrowhead=vee, tailclip=true, arrowsize=1];
struct1:next -> struct1[arrowhead=vee, tailclip=true, arrowsize=1];
}
```
將傳入的 head 的 prev 和 next 指向 head 進行初始化的動作
```clike=
static inline void list_cut_position(struct list_head *head_to,
struct list_head *head_from,
struct list_head *node)
{
struct list_head *head_from_first = head_from->next;
if (list_empty(head_from))
return;
if (head_from == node) {
INIT_LIST_HEAD(head_to);
return;
}
head_from->next = node->next;
head_from->next->prev = head_from;
head_to->prev = node;
node->next = head_to;
head_to->next = head_from_first;
head_to->next->prev = head_to;
}
```
將 list 從 node 切割
```graphviz
digraph structs {
rankdir=LR;
node[shape=box];
{
node[shape=record];
struct0 [label="<prev> prev|head_from|<next> next"];
struct1 [label="<prev> prev|head_to|<next> next"];
struct7 [label="<prev> prev|head_fromfirst|<next> next"];
}
#struct0 [label= "head_from|next"];
#struct1 [label= "head_to"];
struct2 [label= "node1"];
struct3 [label= "node2 | node"];
struct4 [label= "node3 "];
struct5 [label= "node4"];
#struct6 [label= "node"];
{
rank="LR";
struct0 -> struct2[dir=both]
}
{
#rank="same";
}
struct7 -> struct2
struct2 -> struct3[arrowhead=vee, dir=both, tailclip=true, arrowsize=1];
struct3 -> struct4[arrowhead=vee, dir=both, tailclip=true, arrowsize=1];
struct4 -> struct5[arrowhead=vee, dir=both, tailclip=true, arrowsize=1];
struct5 -> struct0[arrowhead=vee, dir=both,tailclip=true, arrowsize=1];
}
```
```graphviz
digraph structs {
rankdir=LR;
node[shape=box];
{
node[shape=record];
struct0 [label="<prev> prev|head_from|<next> next"];
struct1 [label="<prev> prev|head_to|<next> next"];
struct7 [label="<prev> prev|head_fromfirst|<next> next"];
}
#struct0 [label= "head_from|next"];
#struct1 [label= "head_to"];
struct2 [label= "node1"];
struct3 [label= "node2 | node"];
struct4 [label= "node3"];
struct5 [label= "node4"];
#struct6 [label= "node"];
struct0 -> struct4[dir=both]
#rank="same";
struct1 -> struct3[dir=both]
struct7 -> struct2
struct2 -> struct3[arrowhead=vee, dir=both, tailclip=true, arrowsize=1];
struct4 -> struct5[arrowhead=vee, dir=both, tailclip=true, arrowsize=1];
struct5 -> struct0[arrowhead=vee, dir=both,tailclip=true, arrowsize=1];
}
```
```graphviz
digraph structs {
rankdir=LR;
node[shape=box];
{
node[shape=record];
struct0 [label="<prev> prev|head_from|<next> next"];
struct1 [label="<prev> prev|head_to|<next> next"];
struct7 [label="<prev> prev|head_fromfirst|<next> next"];
}
#struct0 [label= "head_from|next"];
#struct1 [label= "head_to"];
struct2 [label= "node1"];
struct3 [label= "node2 | node"];
struct4 [label= "node3"];
struct5 [label= "node4"];
#struct6 [label= "node"];
struct0 -> struct4[dir=both]
#rank="same";
struct1 -> struct3[dir=both]
struct1 -> struct2[dir=both]
struct7 -> struct2
struct2 -> struct3[arrowhead=vee, dir=both, tailclip=true, arrowsize=1];
struct4 -> struct5[arrowhead=vee, dir=both, tailclip=true, arrowsize=1];
struct5 -> struct0[arrowhead=vee, dir=both,tailclip=true, arrowsize=1];
}
```
```clike=
static list_ele_t *get_middle(struct list_head *list)
{
struct list_head *fast = list->next, *slow;
list_for_each (slow, list) {
if (COND1 || COND2)
break;
fast = fast->next->next;
}
return list_entry(TTT, list_ele_t, list);
}
```
* slow 每次會往前移一個而 fast 每次會往前移兩位,而當 fast 到達最尾端時 slow 剛好到達中間點,因此回傳 slow 即可得知中間節點
* 因此 COND1 與 COND2 應該分別為 fast->next 以及 fast->next->next
* list_entry 則可透過 TTT 這個 list 去找到 list_ele_t 的位置,由於我們是要找中間節點,因此 TTT 為 slow
```clike=
static void list_merge(struct list_head *lhs,
struct list_head *rhs,
struct list_head *head)
{
INIT_LIST_HEAD(head);
if (list_empty(lhs)) {
list_splice_tail(lhs, head);
return;
}
if (list_empty(rhs)) {
list_splice_tail(rhs, head);
return;
}
while (!list_empty(lhs) && !list_empty(rhs)) {
char *lv = list_entry(lhs->next, list_ele_t, list)->value;
char *rv = list_entry(rhs->next, list_ele_t, list)->value;
struct list_head *tmp = strcmp(lv, rv) <= 0 ? lhs->next : rhs->next;
list_del(tmp);
list_add_tail(tmp, head);
}
list_splice_tail(list_empty(lhs) ? rhs : lhs, head);
}
```
* 將兩個串列合併,由於兩個串列都應為已排序好,因此透過分別從兩個串列的頭到尾依序比較每個 value ,將較小的插入 head 的串列即可完成合併
### 改善
```clike=
typedef struct __element {
char *value;
struct __element *next;
struct list_head list;
} list_ele_t;
```
* 在 list_ele_t 由於 list_head 裡已經有一個 next 的指標 ,在多一個 next 指標就是多餘的因此下面有針對拿掉next指標後,對於記憶體使用的變化量進行實驗
```diff=
typedef struct __element {
char *value;
- struct __element *next;
+ // struct __element *next;
struct list_head list;
} list_ele_t;
```
```diff=
static void q_free(queue_t *q)
list_ele_t *current = q->head;
while (current) {
list_ele_t *tmp = current;
- current = current->next;
+ if (current->list.next == &(q->list))
+ break;
+ current = list_entry(current->list.next, list_ele_t, list);
free(tmp->value);
free(tmp);
}
```
修改函式: q_insert_head
```diff
newh->value = new_value;
- newh->next = q->head;
+ newh->list.next = &(q->head->list);
q->head = newh;
if (q->size == 0)
q->tail = newh;
```
後面又發現 queue_t 可以直接由 list_head 進行取代,因為這個實作裡沒有使用到 queue_t ,因此在下面實作也透過用 list_head 來取代 queue_t 所使用的記憶體量
此為 list_sort_not_q.c 裡實作不用 queue_t 的方式
```clike=
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "list.h"
#include "list_sort_no_q.h"
static struct list_head *list_new() {
struct list_head *head = malloc(sizeof(struct list_head));
if (!head)
return NULL;
INIT_LIST_HEAD(head);
return head;
}
bool list_insert_head(struct list_head *head, char *s) {
if (!head)
return false;
list_ele_t *newh = malloc(sizeof(list_ele_t));
if (!newh)
return false;
char *new_value = strdup(s);
if (!new_value) {
free(newh);
return false;
}
newh->value = new_value;
// newh->list.next = head->next;
// head->next = newh;
list_add_tail(&(newh->list), head);
}
static void list_free(struct list_head *head) {
if (!head)
return;
struct list_head *current = head->next;
while (current) {
if (current == head) {
break;
}
list_ele_t *tmp = list_entry(current, list_ele_t, list);
current = current->next;
free(tmp->value);
free(tmp);
}
}
int main(void)
{
FILE *fp = fopen("cities.txt", "r");
if (!fp) {
perror("failed to open cities.txt");
exit(EXIT_FAILURE);
}
struct list_head *head = list_new();
char buf[256];
while (fgets(buf, 256, fp))
list_insert_head(head, buf);
fclose(fp);
list_merge_sort(head);
// assert(validate(q));
list_free(head);
return 0;
}
```
原本的 list_merge_sort 為將 quete_t 當作參數,這邊將它改為傳入list_head
```clike=
void list_merge_sort(struct list_head *head) {
if (list_is_singular(head))
return;
struct list_head left;
struct list_head sorted;
INIT_LIST_HEAD(&left);
list_cut_position(&left, head, &get_middle(head)->list);
list_merge_sort(&left);
list_merge_sort(head);
list_merge(&left, head, &sorted);
INIT_LIST_HEAD(head);
list_splice_tail(&sorted, head);
}
```
1. 在節點結構中有 next
![](https://i.imgur.com/HOZqSon.png)
![](https://i.imgur.com/GGemQhc.png)
2. 直接使用 list 中的 next
![](https://i.imgur.com/nr0rYMy.png)
![](https://i.imgur.com/jgcTDAU.png)
3. 未使用 queue_t 直接使用 list_head
![](https://i.imgur.com/xtMzRiH.png)
![](https://i.imgur.com/f4nFZH2.png)
---
## Q2
```clike=
uint16_t func(uint16_t N) {
/* change all right side bits to 1 */
N |= N >> 1;
N |= N >> X;
N |= N >> Y;
N |= N >> Z;
return (N + 1) >> 1;
}
```
`FUNC` 的目標是要找到小於或等於 N 的 power-of-2 ,這邊使用的方式是先將 last set bit 的右邊都變成1,我們假設 N = (0...01....),在第一次 N |= N >> 1 , N 則會變成 (0..011......),那在下一次則變成 N |= N >> 2 , N 則會變成 (0..01111...),以此類推每次右移都為 power-of-2(1,2,4,8,.....),由於 N 為16 bit ,因此最後只需右移4次即可將16 bit 的所有可能涵蓋。
```clike=
#define roundup_pow_of_two(n) \
( \
__builtin_constant_p(n) ? ( \
(n == 1) ? 1 : \
(1UL << (ilog2((n) - 1) + 1)) \
) : \
__roundup_pow_of_two(n) \
)
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
return 1UL << fls_long(n - 1);
}
```
首先我們可以看到 `__roundup_pow_of_two` 裡用到的 `fls_long(N-1)` ,主要是用來找到 position of last set bit ,當將 1UL 左移這個位置的 bit 數後,即會 set last set bit 的 前一個 bit ,這個值即是大於 N 的 power-of-2 ,而傳入 `fls_long` 的參數為 n-1 則是因為如果 n 剛好為 power-of-2 則是直接回傳 n ,因此如果先減1再做 `fls_long` 找到的才會剛好是 n
:::info
__builtin_constant_p 為在 gcc 中的內建函數,主要用於檢測傳入值是否為在編譯時期就可被定義為常數, gcc 內的敘述如下
Built-in Function: int __builtin_constant_p (exp)
You can use the built-in function __builtin_constant_p to determine if a value is known to be constant at compile time and hence that GCC can perform constant-folding on expressions involving that value. The argument of the function is the value to test. The function returns the integer 1 if the argument is known to be a compile-time constant and 0 if it is not known to be a compile-time constant. A return of 0 does not indicate that the value is not a constant, but merely that GCC cannot prove it is a constant with the specified value of the -O option.
You typically use this function in an embedded application where memory is a critical resource. If you have some complex calculation, you may want it to be folded if it involves constants, but need to call a function if it does not.
:::
:::info
這邊有個問題就是不知道為什麼要先確認是否為常數?
> 設計實驗,觀察產出來的組合語言 (記得要打開 `-O2` 或更高等級的最佳化) 在給定常數和非常數表示的狀況下,有何落差。Linux 核心相當依賴 GNU extension,你需要更深入理解。
> :notes: jserv
:::
#### 相關實驗
* 將原本 linux kernel 的 `roundup_pow_of_two` 移出程獨立程式並且分別給定常數與非常數去進行測試
* 在常數的情形下, gcc 在編譯的過程中已經將結果算出並且在執行過程實是直接利用結果來印出
* 在非常數的情形下,程式會進入 ` __roundup_pow_of_two(n)` 來進行運算,此情形相較於常數來說效能較差
* 在最後我們有放上當非常數使用 ilog2() 裡計算常數冪次的方式,發現所許使用的比較次數比原本 `fls_long()` 還有多很多
:::info
為什麼常數需要使用叫慢的程式碼,雖然它是在編譯時期即執行
:::
```shell=
gcc -O2 -S constant_test.c -o constant_test
```
```clike=
int main() {
int a = 6;
volatile int x;
x = a;
a = roundup_pow_of_two(x);
printf("%d\n", a);
return 0;
}
```
非常數
```clike=
main:
.LFB42:
.cfi_startproc
endbr64
subq $24, %rsp
.cfi_def_cfa_offset 32
movl $6, 12(%rsp)
movl 12(%rsp), %ecx
subl $1, %ecx
je .L2
movl $32, %eax
testl $-65536, %ecx
jne .L3
sall $16, %ecx
movl $16, %eax
.L3:
testl $-16777216, %ecx
jne .L4
sall $8, %ecx
subl $8, %eax
.L4:
testl $-268435456, %ecx
jne .L5
sall $4, %ecx
subl $4, %eax
.L5:
testl $-1073741824, %ecx
je .L14
movl %ecx, %edx
movl %eax, %ecx
.L6:
cmpl $-2147483648, %edx
sbbl $0, %ecx
.L2:
movl $1, %edx
leaq .LC0(%rip), %rsi
movl $1, %edi
xorl %eax, %eax
salq %cl, %rdx
call __printf_chk@PLT
xorl %eax, %eax
addq $24, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
.L14:
.cfi_restore_state
leal 0(,%rcx,4), %edx
leal -2(%rax), %ecx
jmp .L6
.cfi_endproc
```
常數
```clike=
int main() {
int a = 6;
a = roundup_pow_of_two(a);
printf("%d\n", a);
return 0;
}
```
```clike=
main:
.LFB42:
.cfi_startproc
endbr64
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $8, %edx
movl $1, %edi
xorl %eax, %eax
leaq .LC0(%rip), %rsi
call __printf_chk@PLT
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
```
* 非常數使用 `ilog2()` 裡計算常數 ilog2() 的部份
```clike=
main:
.LFB23:
.cfi_startproc
endbr64
subq $24, %rsp
.cfi_def_cfa_offset 32
xorl %edx, %edx
movl $6, 12(%rsp)
movl 12(%rsp), %eax
cmpl $1, %eax
jle .L2
movl 12(%rsp), %eax
movl $63, %edx
testl %eax, %eax
js .L2
movl 12(%rsp), %eax
movl $62, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $61, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $60, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $59, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $58, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $57, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $56, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $55, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $54, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $53, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $52, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $51, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $50, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $49, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $48, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $47, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $46, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $45, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $44, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $43, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $42, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $41, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $40, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $39, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $38, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $37, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $36, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $35, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $34, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $33, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $32, %edx
testl $-2147483648, %eax
jne .L2
movl 12(%rsp), %eax
movl $31, %edx
testl %eax, %eax
js .L2
movl 12(%rsp), %eax
movl $30, %edx
testl $1073741824, %eax
jne .L2
movl 12(%rsp), %eax
movl $29, %edx
testl $536870912, %eax
jne .L2
movl 12(%rsp), %eax
movl $28, %edx
testl $268435456, %eax
jne .L2
movl 12(%rsp), %eax
movl $27, %edx
testl $134217728, %eax
jne .L2
movl 12(%rsp), %eax
movl $26, %edx
testl $67108864, %eax
jne .L2
movl 12(%rsp), %eax
movl $25, %edx
testl $33554432, %eax
jne .L2
movl 12(%rsp), %eax
movl $24, %edx
testl $16777216, %eax
jne .L2
movl 12(%rsp), %eax
movl $23, %edx
testl $8388608, %eax
jne .L2
movl 12(%rsp), %eax
movl $22, %edx
testl $4194304, %eax
jne .L2
movl 12(%rsp), %eax
movl $21, %edx
testl $2097152, %eax
jne .L2
movl 12(%rsp), %eax
movl $20, %edx
testl $1048576, %eax
jne .L2
movl 12(%rsp), %eax
movl $19, %edx
testl $524288, %eax
jne .L2
movl 12(%rsp), %eax
movl $18, %edx
testl $262144, %eax
jne .L2
movl 12(%rsp), %eax
movl $17, %edx
testl $131072, %eax
jne .L2
movl 12(%rsp), %eax
movl $16, %edx
testl $65536, %eax
jne .L2
movl 12(%rsp), %eax
movl $15, %edx
testb $-128, %ah
jne .L2
movl 12(%rsp), %eax
movl $14, %edx
testb $64, %ah
jne .L2
movl 12(%rsp), %eax
movl $13, %edx
testb $32, %ah
jne .L2
movl 12(%rsp), %eax
movl $12, %edx
testb $16, %ah
jne .L2
movl 12(%rsp), %eax
movl $11, %edx
testb $8, %ah
jne .L2
movl 12(%rsp), %eax
movl $10, %edx
testb $4, %ah
jne .L2
movl 12(%rsp), %eax
movl $9, %edx
testb $2, %ah
jne .L2
movl 12(%rsp), %eax
movl $8, %edx
testb $1, %ah
jne .L2
movl 12(%rsp), %eax
movl $7, %edx
testb $-128, %al
jne .L2
movl 12(%rsp), %eax
movl $6, %edx
testb $64, %al
jne .L2
movl 12(%rsp), %eax
movl $5, %edx
testb $32, %al
jne .L2
movl 12(%rsp), %eax
movl $4, %edx
testb $16, %al
jne .L2
movl 12(%rsp), %eax
movl $3, %edx
testb $8, %al
jne .L2
movl 12(%rsp), %eax
andl $4, %eax
cmpl $1, %eax
sbbl %edx, %edx
addl $2, %edx
```
```cpp
#define rounddown_pow_of_two(n) \
( \
__builtin_constant_p(n) ? ( \
(1UL << ilog2(n))) : \
__rounddown_pow_of_two(n) \
)
#endif /* _TOOLS_LINUX_LOG2_H */
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
{
return 1UL << (fls_long(n) - 1);
}
```
`__rounddown_pow_of_two` 的方式與 up 類似,不過因為要找到小於等於 N 的 power-of-two ,所以在 `fls_long` 之後要先減1再進行左移,1UL 的1移到的位置才會剛好是 N 的 last set bit 。
```clike=
static inline unsigned fls_long(unsigned long l)
{
if (sizeof(l) == 4)
return fls(l);
return fls64(l);
}
static inline __attribute__ ((const)) int fls(unsigned int x)
{
if (__builtin_constant_p(x))
return constant_fls(x);
return 32 - clz(x);
}
static inline int constant_fls(unsigned int x)
{
int r = 32;
if (!x)
return 0;
if (!(x & 0xffff0000u)) {
x <<= 16;
r -= 16;
}
if (!(x & 0xff000000u)) {
x <<= 8;
r -= 8;
}
if (!(x & 0xf0000000u)) {
x <<= 4;
r -= 4;
}
if (!(x & 0xc0000000u)) {
x <<= 2;
r -= 2;
}
if (!(x & 0x80000000u))
r -= 1;
return r;
}
```
### slab
:::info
[Linux 核心設計: 記憶體管理](https://hackmd.io/t_SzlKdqQKGk7VFYnZ2NKQ?both)
* 所謂的 slab 是從 buddy sysytem 要一個 page 過來,之後依照不同的 kmalloc 所丟入不同的 object size (例如 kmalloc-96, kmalloc-192, kmalloc-8, kmalloc-32 等等),把 page 內部切成 object size 為單位的格式
* 之後 return freelist 的第一個 free 的 object 給 kmalloc 的呼叫者
* 其中,初始化過後,每個 object 的頭 8 個 bytes 儲存 freepointer 指向下一個可得的 available object size。用 linked list 的方式串起整個 slab(page) 內部所有空的 objects
---
[The Slab Allocator: An Object-Caching Kernel Memory Allocator](https://people.eecs.berkeley.edu/~kubitron/courses/cs194-24-S14/hand-outs/bonwick_slab.pdf)
在這篇文章中有提到使用 slab 的原因,主要是因為如果是使用舊的方式每次要新的物件時就要重新建立並且在結束時刪除空間就會產生許多時間上的損失,如果執行 slab 大部分的時間只用於 memmory allocation ,所需的時間成本就如圖中會明顯的減少而造成效能的增進
![](https://i.imgur.com/bdS0lU1.png)
---
[SLAB per frame freelist management](https://events.static.linuxfound.org/sites/events/files/slides/slaballocators.pdf)
![](https://i.imgur.com/4r8Ctog.png)
:::
* 在 `kmalloc` 透過傳入 `size` 大小找到符合最小的 cache 空間,再利用所回傳的 `index` 去找是否還有可使用的空間
* 在 `slab.h` 中以使用到 `power_of_2` 在 `kmalloc_index` 的函式中, `kmalloc_index` 是用來查找符合 `size` 的最小的 cache 的大小,而其所使用的方式就是類似 `roundup_pow_of_two` 的方式,而這邊是直接將其所有可能皆列出來
:::info
問題 :
是否有更有效的方式實作,如果使用類似 `fls()` 直接找到最高有效位元的前一位也可以找到所要的 index ,如果每次都需比較這麼多次,是否會影響效能?
:::
```clike=
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size)) {
#ifndef CONFIG_SLOB
unsigned int index;
#endif
if (size > KMALLOC_MAX_CACHE_SIZE)
return kmalloc_large(size, flags);
#ifndef CONFIG_SLOB
index = kmalloc_index(size);
if (!index)
return ZERO_SIZE_PTR;
return kmem_cache_alloc_trace(
kmalloc_caches[kmalloc_type(flags)][index],
flags, size);
#endif
}
return __kmalloc(size, flags);
}
static __always_inline unsigned int kmalloc_index(size_t size)
{
if (!size)
return 0;
if (size <= KMALLOC_MIN_SIZE)
return KMALLOC_SHIFT_LOW;
if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
return 1;
if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
return 2;
if (size <= 8) return 3;
if (size <= 16) return 4;
if (size <= 32) return 5;
if (size <= 64) return 6;
if (size <= 128) return 7;
if (size <= 256) return 8;
if (size <= 512) return 9;
if (size <= 1024) return 10;
if (size <= 2 * 1024) return 11;
if (size <= 4 * 1024) return 12;
if (size <= 8 * 1024) return 13;
if (size <= 16 * 1024) return 14;
if (size <= 32 * 1024) return 15;
if (size <= 64 * 1024) return 16;
if (size <= 128 * 1024) return 17;
if (size <= 256 * 1024) return 18;
if (size <= 512 * 1024) return 19;
if (size <= 1024 * 1024) return 20;
if (size <= 2 * 1024 * 1024) return 21;
if (size <= 4 * 1024 * 1024) return 22;
if (size <= 8 * 1024 * 1024) return 23;
if (size <= 16 * 1024 * 1024) return 24;
if (size <= 32 * 1024 * 1024) return 25;
if (size <= 64 * 1024 * 1024) return 26;
BUG();
/* Will never be reached. Needed because the compiler may complain */
return -1;
}
```
---
## Q3
```clike=
#include <stdint.h>
void bitcpy(void *_dest, /* Address of the buffer to write to */
size_t _write, /* Bit offset to start writing to */
const void *_src, /* Address of the buffer to read from */
size_t _read, /* Bit offset to start reading from */
size_t count)
{
size_t read_lhs = _read & 7;
size_t read_rhs = 8 - read_lhs;
const uint8_t *source = (const uint8_t *) _src + (_read / 8);
size_t write_lhs = _write & 7;
size_t write_rhs = 8 - write_lhs;
uint8_t *dest = (uint8_t *) _dest + (_write / 8);
static const uint8_t read_mask[] = {
0x00, /* == 0 00000000b */
0x80, /* == 1 10000000b */
0xC0, /* == 2 11000000b */
0xE0, /* == 3 11100000b */
0xF0, /* == 4 11110000b */
0xF8, /* == 5 11111000b */
0xFC, /* == 6 11111100b */
0xFE, /* == 7 11111110b */
0xFF /* == 8 11111111b */
};
static const uint8_t write_mask[] = {
0xFF, /* == 0 11111111b */
0x7F, /* == 1 01111111b */
0x3F, /* == 2 00111111b */
0x1F, /* == 3 00011111b */
0x0F, /* == 4 00001111b */
0x07, /* == 5 00000111b */
0x03, /* == 6 00000011b */
0x01, /* == 7 00000001b */
0x00 /* == 8 00000000b */
};
while (count > 0) {
uint8_t data = *source++;
size_t bitsize = (count > 8) ? 8 : count;
if (read_lhs > 0) {
RRR;
if (bitsize > read_rhs)
data |= (*source >> read_rhs);
}
if (bitsize < 8)
data &= read_mask[bitsize];
uint8_t original = *dest;
uint8_t mask = read_mask[write_lhs];
if (bitsize > write_rhs) {
/* Cross multiple bytes */
*dest++ = (original & mask) | (data >> write_lhs);
original = *dest & write_mask[bitsize - write_rhs];
*dest = original | (data << write_rhs);
} else {
// Since write_lhs + bitsize is never >= 8, no out-of-bound access.
DDD;
*dest++ = (original & mask) | (data >> write_lhs);
}
count -= bitsize;
}
}
```
* 9-10行中, `_read & 7` 作用等同於 `_read % 8` ,因此 `read_lhs` 可求出開始讀取的第一個位元在該位元組的位置, 而 `read_rhs` 是透過 `8 - read_lhs` 是求出該位元組所需讀取的位元數
* 11行中,找到開始讀取第一個位元的位元組,透過 `_read / 8` 即可找出須位移的 byte 數
* 12-14行,與 read 相同的方式找到 write 的數值
* `RRR` 為 `data <<= read_lhs` ,因為 data 該位元組只從第`read_lhs` 個 bit 開始讀取因此透過左移來找到第一個 bit 開始的 `bitsize` 個 bit ,在45-46行中,如果要取的 bit 數超過該位元組從下一個位元組取前面 `read_rhs` 個 bit 數
* 49-50行,如果該次所需取的 bit 數不到8個,要將該次所取的位元組多餘的 bit 設為 0 ,因此利用 mask 的方式將 bitsize 之後的其他 bit 皆設為 0
* `original & mask` 將該位元組的前 k 個 bit 設為 0(第 k 個開始寫入, k 為 `_write` 所設定的),`data >> write_lhs` 則將 data 右移使之對齊該位元組開始的 bit , 最後將 dest 從第 k 個開始填入 data 資料,在58行中將多餘的 bit 填入下一個位元組
* `DDD` 為 `mask |= write_mask[write_lhs + bitsize]`
* 61-62行中,則是如果 bitsize 要存入的 bit_size 不會填滿該位元組,要分別將左邊以及右邊多餘的 bit 皆設為 0
---
## Q4
```clike=
enum {
CSTR_PERMANENT = 1,
CSTR_INTERNING = 2,
CSTR_ONSTACK = 4,
};
#define CSTR_INTERNING_SIZE (32)
#define CSTR_STACK_SIZE (128)
```
* 透過 hash_size 字串長度來進行比對,相較於直接用字串比對快速
* ref 為 reference counting 用來計算多少同樣字串連結
```clike=
typedef struct __cstr_data {
char *cstr;
uint32_t hash_size;
uint16_t type;
uint16_t ref;
} * cstring;
```
* ##前置處理器應用
* 初始化 `var` 的字串成 `__cstr_data` ,將其放入 stack 中
```clike=
#define CSTR_BUFFER(var) \
char var##_cstring[CSTR_STACK_SIZE] = {0}; \
struct __cstr_data var##_cstr_data = {var##_cstring, 0, CSTR_ONSTACK, 0}; \
cstr_buffer var; \
var->str = &var##_cstr_data;
```
* 先將 cstr 透過 `cstr_clone` 複製一份給 tmp
* `__sync_bool_compare_and_swap` 則是先比較 `&var` 和 `NULL` 如果想等則會使 `&var = tmp`
*
```clike=
#define CSTR_LITERAL(var, cstr) \
static cstring var = NULL; \
if (!var) { \
cstring tmp = cstr_clone("" cstr, (sizeof(cstr) / sizeof(char)) - 1); \
if (tmp->type == 0) { \
tmp->type = CSTR_PERMANENT; \
tmp->ref = 0; \
} \
if (!__sync_bool_compare_and_swap(&var, NULL, tmp)) { \
if (tmp->type == CSTR_PERMANENT) \
free(tmp); \
} \
}
```
```clike=
#define CSTR_CLOSE(var) \
do { \
if (!(var)->str->type) \
cstr_release((var)->str); \
} while (0)
```
* CCC 為 s->hash_size
* 先判定 `sb` 裡面的字串是否在 stack 裡,如果是則將字串接在 `sb` 的字串後面
```clike=
cstring cstr_cat(cstr_buffer sb, const char *str)
{
cstring s = sb->str;
if (s->type == CSTR_ONSTACK) {
int i = CCC;
while (i < CSTR_STACK_SIZE - 1) {
s->cstr[i] = *str;
if (*str == 0)
return s;
++s->hash_size;
++str;
++i;
}
s->cstr[i] = 0;
}
cstring tmp = s;
sb->str = cstr_cat2(tmp->cstr, str);
cstr_release(tmp);
return sb->str;
}
```
* 會將傳進來的字串的長度相加確定是否超出 interning 的長度,如果沒有超出則進行 interning,如果超過則直接回傳一個 cstring
```clike=
static cstring cstr_cat2(const char *a, const char *b)
{
size_t sa = strlen(a), sb = strlen(b);
if (sa + sb < CSTR_INTERNING_SIZE) {
char tmp[CSTR_INTERNING_SIZE];
memcpy(tmp, a, sa);
memcpy(tmp + sa, b, sb);
tmp[sa + sb] = 0;
return cstr_interning(tmp, sa + sb, hash_blob(tmp, sa + sb));
}
cstring p = xalloc(sizeof(struct __cstr_data) + sa + sb + 1);
if (!p)
return NULL;
char *ptr = (char *) (p + 1);
p->cstr = ptr;
p->type = 0;
p->ref = 1;
memcpy(ptr, a, sa);
memcpy(ptr + sa, b, sb);
ptr[sa + sb] = 0;
p->hash_size = 0;
return p;
}
```
* 如果兩個 cstring 是 interning 的話,那他們會指向同一個位置
* 如果是在 stack 上,則先看 hash_size 來判定兩個字串長度是否相同,如果不相同則必為不同,如果相同則要比較兩個字串內容是否相同
* 如果皆非上面兩種則在使用雜湊函數來進行比對
* 最後如果上述都不成立則在直接兩個字串的長短
:::info
不確定使用雜湊函數來進行判斷的用意,可能為相較於比較字串長短速度較快,而且如果是在 stack 上的字串在 hash 裡面還會被判斷一次增加效能負擔
:::
```clike=
int cstr_equal(cstring a, cstring b)
{
if (a == b)
return 1;
if ((a->type == CSTR_INTERNING) && (b->type == CSTR_INTERNING))
return 0;
if ((a->type == CSTR_ONSTACK) && (b->type == CSTR_ONSTACK)) {
if (a->hash_size != b->hash_size)
return 0;
return memcmp(a->cstr, b->cstr, a->hash_size) == 0;
}
uint32_t hasha = cstr_hash(a);
uint32_t hashb = cstr_hash(b);
if (hasha != hashb)
return 0;
return !strcmp(a->cstr, b->cstr);
}
```
* 如果在 stack 上,則利用 `hash_size` 進行雜湊,不過放在 stack 上字串在 `cat2` 可是 `cat2` 裡已經有透過 `hash_size` 判斷長度因此這邊可以進行修改來增進效能
* 如果非 stack 則是透過字串長度來進行雜湊
```clike=
static uint32_t cstr_hash(cstring s)
{
if (s->type == CSTR_ONSTACK)
return hash_blob(s->cstr, s->hash_size);
if (s->hash_size == 0)
s->hash_size = hash_blob(s->cstr, strlen(s->cstr));
return s->hash_size;
}
static inline uint32_t hash_blob(const char *buffer, size_t len)
{
const uint8_t *ptr = (const uint8_t *) buffer;
size_t h = len;
size_t step = (len >> 5) + 1;
for (size_t i = len; i >= step; i -= step)
h = h ^ ((h << 5) + (h >> 2) + ptr[i - 1]);
return h == 0 ? 1 : h;
}
```
* 先確定 `cstring` 是否還在 interning 或 stack 中,並且確定 reference count 是否為0
* `__sync_sub_and_fetch` 會將 `&s->ref` 先減 1 ,然後才進行 free
```clike=
void cstr_release(cstring s)
{
if (s->type || !s->ref)
return;
if (__sync_sub_and_fetch(&s->ref, 1) == 0)
free(s);
}
```