Try   HackMD

學習 Linux 流水筆記

大部分材料都來自於 Linux 核心設計/實作 (Linux Kernel Internals), 你所不知道的 C 語言

前篇:link

Perf

performance event

背景執行

./<> &
perf top -p $pid

-e <event>
perf top -e cache-misses

-r <n>
-r 5

Valgrind

user manual

Cachegrind

Cachegrind doc

item description
I1 first-level instruction cache
D1 first-level data cache
L2 second-level cache
LL last-level cache
mr read misses
mw write misses
Bc conditional branches executed
Bcm conditional branches mispredicted
Bi indirect branches executed
Bim indirect branches mispredicted

函式呼叫篇

memory layout

  • stack: stack frame 的 local variables 與 return address 等
  • heap: 動態配置的,它只是一個 memory pool ,而不是 "heap"
  • BSS segement: 尚未初始化的變數
  • Data segement: 已經被初始化後的變數
  • Text segement: 存放使用者程式的 binary code

x86 stack

  • rip (instruction pointer): 記錄下個要執行的指令位址
  • rsp (stack pointer): 指向 stack 頂端
  • rbp (base pointer, frame pointer): 指向 stack 底部

前置處理器

array size

數值系統

DETECT_NULL()

Bitwise

C 語言標準並未定義 >> 右移為算術位移或邏輯位移

  • gcc 實作上使用算術位移

<< 則固定為邏輯位移

記憶體管理與對齊

在 x86 Linux 上做一些小實驗:

stack and heap 觀察

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

int main()
{
    unsigned char *p = NULL;
    /* stack */
    uint64_t s = 0x1234567890ABCDEF;
    p = (unsigned char *) &s;
    for (int i = 0; i < 8; i++)
        printf("%X is located in %p\n", p[i], p + i);
    printf("*(uint64_t *)%p is %lX\n", (uint64_t *)(((intptr_t)&s) - 1), *(uint64_t *)(((intptr_t)&s) - 1));
    printf("*(uint64_t *)%p is %lX\n", &s, *&s);
    printf("*(uint64_t *)%p is %016lX\n", (uint64_t *)(((intptr_t)&s) + 1), *(uint64_t *)(((intptr_t)&s) + 1));
    printf("*(uint32_t *)%p is %X\n", &s, *(uint32_t *)&s);
    printf("*(uint32_t *)%p is %X\n", (uint32_t *)(((intptr_t)&s) + 4), *(uint32_t *)(((intptr_t)&s) + 4));
    putchar('\n');
    /* heap */
    uint64_t *h = (uint64_t *) malloc(sizeof(uint64_t));
    *h = 0x1234567890ABCDEF;
    p = (unsigned char *) h;
    for (int i = 0; i < 8; i++)
        printf("%X is located in %p\n", p[i], p + i);
    printf("*(uint64_t *)%p is %lX\n", (uint64_t *)(((intptr_t)h) - 1), *(uint64_t *)(((intptr_t)h) - 1));
    printf("*(uint64_t *)%p is %lX\n", h, *h);
    printf("*(uint64_t *)%p is %016lX\n", (uint64_t *)(((intptr_t)h) + 1), *(uint64_t *)(((intptr_t)h) + 1));
    printf("*(uint32_t *)%p is %X\n", h, *(uint32_t *)h);
    printf("*(uint32_t *)%p is %X\n", (uint32_t *)(((intptr_t)h) + 4), *(uint32_t *)(((intptr_t)h) + 4));
    free(h);
    return 0;
}
$ ./layout
EF is located in 0x7ffc10ffd870
CD is located in 0x7ffc10ffd871
AB is located in 0x7ffc10ffd872
90 is located in 0x7ffc10ffd873
78 is located in 0x7ffc10ffd874
56 is located in 0x7ffc10ffd875
34 is located in 0x7ffc10ffd876
12 is located in 0x7ffc10ffd877
*(uint64_t *)0x7ffc10ffd86f is 34567890ABCDEF00
*(uint64_t *)0x7ffc10ffd870 is 1234567890ABCDEF
*(uint64_t *)0x7ffc10ffd871 is 701234567890ABCD
*(uint32_t *)0x7ffc10ffd870 is 90ABCDEF
*(uint32_t *)0x7ffc10ffd874 is 12345678

EF is located in 0x558c0407d6b0
CD is located in 0x558c0407d6b1
AB is located in 0x558c0407d6b2
90 is located in 0x558c0407d6b3
78 is located in 0x558c0407d6b4
56 is located in 0x558c0407d6b5
34 is located in 0x558c0407d6b6
12 is located in 0x558c0407d6b7
*(uint64_t *)0x558c0407d6af is 34567890ABCDEF00
*(uint64_t *)0x558c0407d6b0 is 1234567890ABCDEF
*(uint64_t *)0x558c0407d6b1 is 001234567890ABCD
*(uint32_t *)0x558c0407d6b0 is 90ABCDEF
*(uint32_t *)0x558c0407d6b4 is 12345678

其 memory contents 為:

(high address)
12
34
56
78
90
AB
CD
EF
(stack top)
...
...
...
(heap)
12
34
56
78
90
AB
CD
EF
(low address)

指標實驗

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct {
    int n;
} struct_int_t;

int main()
{
    unsigned char *p = (unsigned char *) malloc(sizeof(unsigned char) * 8); // at heap
    p[0] = 0x88;
    printf("%X is located in %p\n", p[0], p);
    for (int i = 1; i < 8; i++) {
        p[i] = p[i - 1] + 0x11;
        printf("%X is located in %p\n", p[i], p + i);
    }
    printf("*(int *)%p is %X\n", p, *(int *)p);
    printf("*(int *)%p is %X\n", p + 1, *(int *)(p + 1));
    printf("((struct_int_t *)%p)->n is %X\n", p, ((struct_int_t *)p)->n);
    printf("((struct_int_t *)%p)->n is %X\n", p + 1, ((struct_int_t *)(p + 1))->n);
    free(p);
    return 0;
}
$ ./align2
88 is located in 0x55de67c762a0
99 is located in 0x55de67c762a1
AA is located in 0x55de67c762a2
BB is located in 0x55de67c762a3
CC is located in 0x55de67c762a4
DD is located in 0x55de67c762a5
EE is located in 0x55de67c762a6
FF is located in 0x55de67c762a7
*(int *)0x55de67c762a0 is BBAA9988
*(int *)0x55de67c762a1 is CCBBAA99
((struct_int_t *)0x55de67c762a0)->n is BBAA9988
((struct_int_t *)0x55de67c762a1)->n is CCBBAA99

bit field

struct foo {
    int a : 3;
    int b : 2;
    int : 0; /* Force alignment to next boundary */
    int c : 4;
    int d : 3;
};

使用 : 宣告要使用的 bit 數, unamed 0 會強制 alignment 。

when does compiler reorder my struct members?

為了避免需要 alignment 使得程式執行速度降低,所以編譯器在配置空間時常常會做 padding ,結構成員的順序也可能被移動

使用 #pragma pack 一類的預處理指令可以禁止編譯器做這些更動


alloc() 配置於 stack

  • do not free() !

mlockall() :禁止所有 memory 被 swap out

Linux Torvalds 反對 variable length array 的使用

how to measure page fault?


copy-on-write

To read:

C 語言技巧篇

attribute 相關 doc : https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html

cleanup :

The cleanup attribute runs a function when the variable goes out of scope. This attribute can only be applied to auto function scope variables; it may not be applied to parameters or variables with static storage duration. The function must take one parameter, a pointer to a type compatible with the variable. The return value of the function (if any) is ignored.

寫出適用於 CPU, GPU 的程式

  • stack : 離開 scope 自動返還
  • heap : malloc() ,需要 free()

不要使用 strncpy() ,以 strncpy() 代替, sstrncpy() 更高效且安全。

  • assert() issues an error at execution-time
  • static_assert() issues an error at compile-time

unamed field doc

transparent union

為什麼 container_of() 巨集需要宣告 pmember ,不能直接使用 (char *) ptr - offset 嗎?

#define container_of(ptr, type, member) \ __extension__({ \ const __typeof__(((type *) 0)->member) *__pmember = (ptr); \ (type *) ((char *) __pmember - offsetof(type, member)); \ })

為了避免 side effect

To read:

C 編譯器

https://www.slideshare.net/jserv/how-a-compiler-works-gnu-toolchain

  • Basic Blocks (BB)
    • 單一進入點和單一出口點的段落
  • Control Flow Graph (CFG)
  • Intermediate Representation (IR)
    • 更容易支援各種前後端的方式
    • Gimple for gcc
  • Compilation Unit

Static Single Assignment

每個被 assign 的變數只會出現一次

  • constant propagation

進階議題

  • loop optimization
  • inter procedure optimization
  • auto vectorization
  • auto parallelization

跨平台多執行緒 API : OpenMP, C++ AMP

編譯器難以 inline 外部函式,除非有 link time optimization 。

To read:

gcc 參數

Let out be the name of output file.

輸出包含 BB 的 CFG

gcc -c -fdump-tree-cfg=out *.c

輸出最佳化的 BB

gcc -Os -c -fdump-tree-optimized=out *.c

OO

使用 diagram 來幫助了解物件之間的互動關係

To read:

動態連結器

針對 glibc 的解說

設定 LD_PRELOAD 可以讓 glibc 的 dynamic linker 在 load 和 relocate libc.so 之前載入我們撰寫的動態連結函式庫。

ld.so, ld-linux.so

To read:

GNU extension

Macro with ({ }) is called statement expression.


並行程式設計

  • mutex
    • lock()
    • unlock()
  • semaphore
    • signal()
    • wait()

process 與 thread 差異取決於作業系統如何看待

To read:

Sequenced Before

Sequecedbefore 是種對同一個執行緒下,求值順序關係的描述。
Sequenced before 在數學上是一種 asymmetric 和 transitive 的 relation 。

如果每個處理器都有自己的 write buffer 並對 program 進行最佳化,就很可能違反 sequential consistency 。

Happens Before

要求「看起來」的結果

Memory Consistency Models

  • sequentially consistence : 所有順序保證
  • usually strong :
  • weak with data dependency ordering : load 指令的順序保證
  • weak : 所有順序都不保證

To read:

POSIX

https://hpc-tutorials.llnl.gov/posix/

In Unix enviroment, a thread

  • exists within a process and uses the process resources
  • has its own independent flow of control
  • duplicates only the essential resources it needs to be independently schedulable
  • may share the process resources with other threads that act equally independently (and dependently)
  • dies if the parent process dies
  • is “lightweight” because most of the overhead has already been accomplished through the creation of its process

Threads reading and writing to the same memory locations is possible, and therefore requires explicit synchronization by the programmer.

pthread

<pthread.h>

compile with gcc argument -pthread

Some APIs :

  • pthread_create()
  • pthread_exit()
  • pthread_cancel()
create
int pthread_create(pthread_t *restrict thread,
        const pthread_attr_t *restrict attr,
        void *(*start_routine)(void*), void *restrict arg);

thread : An unique identifier
attr : attribute, we may specify NULL for default values
start_routine : will be executed once the thread is created
arg : single argument that may be passed to start_routine

exit
void pthread_exit(void *retval);

There are several ways in which a thread may be terminated :

  • The thread returns normally from its starting routine. Its work is done.
  • The thread makes a call to the pthread_exit().
  • The thread is canceled by another thread via pthread_cancel().
  • The entire process is terminated.
  • main() finishes.

By having main() explicitly call pthread_exit() as the last thing it does, main() will block and be kept alive to support the threads it created until they are done.

Joining and Detaching

When a thread is created, one of its attributes defines whether it is joinable or detached. Only threads that are created as joinable can be joined. If a thread is created as detached, it can never be joined.

The pthread_detach() routine can be used to explicitly detach a thread even though it was created as joinable.

For portability, create explicitly as joinable or detachable.

mutex

  • PTHREAD_MUTEX_INITIALIZER
  • pthread_mutex_init()
  • pthread_mutex_unlock()
  • pthread_mutex_destroy()
    • destroy a locked mutex is undefined behavior

To read:


System call

user-mode 的程式藉由 system call 來向 os 要資源

  • strace 追蹤 system call
  • ltrace 追蹤 library call

Linux 核心 Scheduler

  • scheduling goal
  • completely fair
  • real-time

scheduling 的時候把相關的 process 視為一個 group 一起排

測量 scheduling 的成本:

$ taskset -c 0 perf bench sched pipe
# Running 'sched/pipe' benchmark:
# Executed 1000000 pipe operations between two processes

     Total time: 5.072 [sec]

       5.072295 usecs/op
         197149 ops/sec

$ numactl --hardware
available: 1 nodes (0)
node 0 cpus: 0 1 2 3 4 5 6 7
node 0 size: 7833 MB
node 0 free: 577 MB
node distances:
node   0 
  0:  10 

RB tree 在 worst case 的表現優於 AVL tree

kernel 中會有 disable preemption 的區域

Timer

https://www.youtube.com/watch?v=Puv4mW55bF8&t=332s

在 kernel 中計時的方法本質上就是一個 counter,有一個固定的頻率,週期乘以計數器的差值就可以取到時間。

用 struct clocksource 來抽象化 hardware counter

POSIX clocks:

  • CLOCK_BOOTTIME
    • 無論是 suspend 或什麼情況,都會一直 increasing 的計數器
  • CLOCK_MONOTONIC
    • suspend 時會跟著暫停
  • CLOCK_REALTIME
    • wall time
  • CLOCK_TAI

kernel doc

在沒有 hardware counter 的系統上引用 GPS,jiffies,雖然不會有 nanosecond resolution,但勉強能用。

抽象成 struct clock_event_device


https://hackmd.io/@sysprog/linux-timer

kernel 藉著 HZ 變數來算時間,而 tickless kernel 引進 NO_HZ 的全新機制。

The tick rate is a periodic interrupt, HZ.

  • wall time, important for user-space application
  • system uptime, important for both kernel and user mode

The value of HZ is architecture dependent. 該值在編譯時期就要決定。

larger HZ 的優點:

  • kernel 時間更準
  • 各種量測的 resolution 更佳
  • preemption 的時間更 accurate

缺點:

  • 更大的 overhead
  • thrash cashe and increase in power consumption

fire the timer 觸發計時器

tickless

  • tickless is not tick-free
  • 又名 dyntick (dynamic tick)

越精準的的 timer 越耗電,因此系統可能提供很多組 timer,有些 resolution 偏低但是可以省電。

系統休眠的時候改用 tickless 機制來省電

tickless 需要使用 RCU

cputime counting:

  • utime for user-space
  • stime for kernel-space

jiffies

<linux/jiffies.h>

The global variable jiffies holds the number of ticks that have occured since the system booted. On boot, the kernel initializes the variable to zero, and it is incremented by one during each timer interrupt.

jiffies 在 32-bit 上是 u32,在 64-bit 上是 u64。

每秒有 HZ 個 timer interrupt,因此 system uptime 就是 jiffies

÷ HZ (second)。

基本上 (在一般的 HZ value 之下) 64-bit 上的 jiffies 不會在人類的生命時長裡 overflow。

Clock and timer

變數 xtime 紀錄現在的時間和日期

timer interrupt 分成:

  • interrupt handler
    • architecture dependent
    1. obtain xtime_lock
    2. acknowledge or reset the system timer
    3. save the updated wall time to the real time clock
    4. call tick_periodic()
  • tick_periodic()
    • architecture independent
    1. increment jiffies_64 by one
    2. update resource usages for currently running process
    3. execute scheduler_tick()
    4. update wall time in xtime

RCU

kernel doc

mm

bootmem

系統初始化時所使用的 mm 機制

buddy system

分成一些

2N 大小的記憶體區塊,需要時直接去取用。

$ cat buddyinfo 
Node 0, zone      DMA      0      0      0      0      0      0      0      0      0      1      3 
Node 0, zone    DMA32   3092   1019    525    340    263    177     88     26     10      3      0 
Node 0, zone   Normal    561    574    983    494    197     66     13      2      8      7      2 

buddy system 有 fragmentation 的問題

slab

使用 list 來管理 pages,分成 3 條 lists

  • full
  • partial
  • empty

slab 會一次拿很多 page,然後把 page 切分成很多小塊等著被 allocate。

在多 CPU、NUMA 的情境非常劣勢

slub

File System

ABI:application binary interface
VFS:virtual file system
FHS:filesystem hierachy

  • virtual 表示某程度上模擬真實
  • pseudo 則有不是真的的意思

user-space 可以藉由修改 /proc pseudo file system 來在執行時期變更 kernel 行為

  • inode is the metadata for one file
  • dentry is the file name to inode mapping, containing
    • file name
    • a link to an inode
    • a parent pointer (類似 ..
  • file is a file handle referring to a dentry and a cursor (offset) in the file