# Linux Kernel - slub allocator > Author : 堇姬 Naup https://blog.csdn.net/Breeze_CAT/article/details/130015313 ## layout    Kmem cache cpu -> slab freelist -> kmem cache cpu partial -> kmem cache node ## 結構體 根據上圖,我們可以知道整個 slab allocator 最上層是 `slab_caches` 他是 `kmem_cache` struct double linklist 的 entry https://elixir.bootlin.com/linux/v6.15-rc5/source/mm/slab.h#L382 ```c /* The list of all slab caches on the system */ extern struct list_head slab_caches; ``` kmem_cache 是 Linux 核心中的一種 slab cache 物件管理器。 它負責管理一種特定資料結構類型或固定大小記憶體區塊的 slab 反正每個都是一種 memory pool 可以用 `cat /proc/slabinfo` 來看 可以看到很多不同結構體或大小的 memory pool 這就是他的 management https://elixir.bootlin.com/linux/v6.15-rc5/source/mm/slab.h#L258 ```c /* * Slab cache management. */ struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; #endif /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; unsigned int size; /* Object size including metadata */ unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; /* Number of per cpu partial slabs to keep around */ unsigned int cpu_partial_slabs; #endif struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(void *object); /* Object constructor */ unsigned int inuse; /* Offset to metadata */ unsigned int align; /* Alignment */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) 
*/ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif #ifdef CONFIG_SLAB_FREELIST_HARDENED unsigned long random; #endif #ifdef CONFIG_NUMA /* * Defragmentation by allocating from a remote node. */ unsigned int remote_node_defrag_ratio; #endif #ifdef CONFIG_SLAB_FREELIST_RANDOM unsigned int *random_seq; #endif #ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info; #endif #ifdef CONFIG_HARDENED_USERCOPY unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ #endif struct kmem_cache_node *node[MAX_NUMNODES]; }; ``` * cpu_slab:一個每 CPU 的變數(per-cpu variable),對每個 CPU 而言,相當於一個本地的記憶體快取池。在分配記憶體時會優先從本地 CPU 分配,以提高快取命中率。 * flags:此 cache 的屬性旗標(slab flags,型別為 slab_flags_t,並不是分配用的 gfp 掩碼,gfp 掩碼請見 allocflags),例如常見的 SLAB_HWCACHE_ALIGN 標誌,表示 kmem_cache 管理的物件會按照硬體快取對齊,以提升效能。 * min_partial:struct kmem_cache_node 的 partial 鏈結串列中「至少要保留」的 slab 數量。當某個 slab 變成完全空閒時,只有在該 node 的 nr_partial 已經 ≥ min_partial 的情況下才會把它釋放回 buddy system;否則即使整個 slab 是空的也會留在 partial 鏈結串列上備用。仍含有使用中 object 的 partial slab 不受這個值限制,所以它並不是 partial 鏈結串列長度的上限。 * size:實際分配的物件大小,通常會比原始物件大,因為需要對齊與管理用的額外空間。 * object_size:原始物件的實際大小,是建立 kmem_cache 時傳入的參數。size 會 ≥ object_size,因為 size 包含了對齊與內部用途的額外空間。 * offset:SLUB 在管理 object 時的設計巧妙地利用尚未被使用的 object 空間來存儲「下一個 free object」的指標,形成單向鏈結串列。這個 offset 就是該指標相對於 object 起始位址的偏移量。 * cpu_partial:每 CPU partial 區域中,所有 slab 的 free object 數量上限。超過此數量後,這些 slab 會被轉移到 kmem_cache_node 的 partial 鏈結串列中。 * oo:低 16 位表示一個 slab 中 object 的數量(oo & ((1 << 16) - 1)),高 16 位表示該 slab 使用的 page 數((2^(oo >> 16)) 頁)。 * max:注意上面 v6.15 的結構體中已經沒有這個欄位,它只存在於較舊版本的 kernel;從舊版原始碼看來,它通常等於 oo。 * min:當按 oo 的大小分配記憶體時遇到不足,會退而求其次採用 min 所代表的較小配置方式。min 最少能容納一個 object 即可。 * allocflags:從夥伴系統(buddy system)分配記憶體時使用的掩碼(flags)。 * inuse:object_size 經過 word 對齊後的大小。 * align:記憶體對齊的邊界(以 byte 為單位)。 * name:此 kmem_cache 的名稱,供 sysfs 檔案系統顯示使用。 * list:系統中存在一個 slab_caches 鏈結串列,所有 kmem_cache 實例都會掛載到此串列中。 * node:slab 所屬的節點(NUMA 架構相關)。在 NUMA 系統中,每個 node 都有對應的 struct kmem_cache_node 結構體。 總之比較重要的就是 `struct kmem_cache_cpu __percpu *cpu_slab;` 跟 `struct kmem_cache_node *node[MAX_NUMNODES];` `struct kmem_cache_cpu __percpu *cpu_slab;` 是一個 pointer,指向 kmem_cache_cpu,他是用來描述 CPU 本地的 
memory pool 每個 CPU 對應一個這個結構 https://elixir.bootlin.com/linux/v5.19.17/source/include/linux/slub_def.h#L48 ```c /* * When changing the layout, make sure freelist and tid are still compatible * with this_cpu_cmpxchg_double() alignment requirements. */ struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct slab *slab; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated frozen slabs */ #endif local_lock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif }; ``` https://elixir.bootlin.com/linux/v6.15-rc5/source/mm/slub.c#L383 ```c struct kmem_cache_cpu { union { struct { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ }; freelist_aba_t freelist_tid; }; struct slab *slab; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ #endif local_lock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif }; ``` 這部分跟上述不太一樣,上述圖片是基於 5.x 的 slab allocator 所繪製的,其部分已經被 patch 成這樣 https://lore.kernel.org/all/20211201181510.18784-4-vbabka%40suse.cz/T/ ```c diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 1ef68d4de9c0..00d99afe1c0e 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -48,9 +48,9 @@ enum stat_item { struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ - struct page *page; /* The slab from which we are allocating */ + struct slab *slab; /* The slab from which we are allocating */ #ifdef CONFIG_SLUB_CPU_PARTIAL - struct page *partial; /* Partially allocated frozen slabs */ + struct slab *partial; /* Partially allocated frozen slabs */ #endif ``` 這邊先專注在 5.x 
上 `freelist` 會指向當前使用的 slab page 第一個空閒的 object,他們會被串成一條 linklist `page` 則會指向當前 CPU 使用的 slab page `slab` 指向當前 CPU 使用的 slab `partial` 指向當前 CPU 中尚未被使用完的 slab(slab page)所串成的鏈表 如果 slab 已經被分配完(full)就不會留在這個鏈表中:allocator 不再追蹤它,而是交給持有這些 object 的使用者自行管理,直到其中有 object 被釋放時才會重新被掛回來 PS: 一個 slab 會負責管理一個至多個連續的 pages(不用管這裡有多少個 pages,關注在有很多個相同大小 object) 到了 6.x 應該是不管理 page 了,直接管理 slab 接下來來看另外一條路 他是一個 pointer array,指向 `struct kmem_cache_node` ```c struct kmem_cache_node *node[MAX_NUMNODES]; ``` kmem_cache_node 記錄了該 kmem_cache 下所有的 slab https://elixir.bootlin.com/linux/v6.15-rc5/source/mm/slub.c#L424 ```c /* * The slab lists for all objects. */ struct kmem_cache_node { spinlock_t list_lock; unsigned long nr_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG atomic_long_t nr_slabs; atomic_long_t total_objects; struct list_head full; #endif }; ``` 裡面有 partial slab 的 linklist,以及 full slab 的 linklist(full 只有在開啟 CONFIG_SLUB_DEBUG 時才存在),用來管理 slab 基本上到這邊已經了解了整個 slab allocator 架構了 ## slab ```c /* Reuses the bits in struct page */ struct slab { unsigned long __page_flags; struct kmem_cache *slab_cache; union { struct { union { struct list_head slab_list; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; int slabs; /* Nr of slabs left */ }; #endif }; /* Double-word boundary */ union { struct { void *freelist; /* first free object */ union { unsigned long counters; struct { unsigned inuse:16; unsigned objects:15; /* * If slab debugging is enabled then the * frozen bit can be reused to indicate * that the slab was corrupted */ unsigned frozen:1; }; }; }; #ifdef system_has_freelist_aba freelist_aba_t freelist_counter; #endif }; }; struct rcu_head rcu_head; }; unsigned int __page_type; atomic_t __page_refcount; #ifdef CONFIG_SLAB_OBJ_EXT unsigned long obj_exts; #endif }; ``` 一組包含一個或多個連續 pages 的記憶體,這些頁面包含特定大小的 kernel object kmem_cache_cpu 及 slab freelist 分配時: - 先從當前 CPU 的 CPU freelist 分配,不需要加鎖(lockless fast path) - 若 CPU freelist 空了,就從 slab 的 freelist 拿一批 object refill(會加鎖) 釋放時: - 先放回本 CPU 的 freelist(若 object 屬於本 CPU 目前正在使用的 active slab) - 若不是(例如跨 CPU 釋放,或該 slab 已經不是本 CPU 的 active slab),就會放回 slab 
freelist(可能會涉及鎖) ## kmalloc & kmalloc-cg https://github.com/torvalds/linux/commit/494c1dfe855ec1f70f89552fce5eadf4a1717552 ## object 實際分配的單位,會將一個 slab 管理的 page 切成相同大小的 object   ``` gef> x/30xg 0xffff888004ec0090 0xffff888004ec0090: 0x6b6b6b6b6b6b6b6b 0xa56b6b6b6b6b6b6b 0xffff888004ec00a0: 0xbbbbbbbbbbbbbbbb 0xffff888004ec0110 0xffff888004ec00b0: 0x0000000000000000 0x0000000000000000 0xffff888004ec00c0: 0x0000000000000000 0x0000000000000000 0xffff888004ec00d0: 0x0000000000000000 0x0000000000000000 0xffff888004ec00e0: 0x0000000000000000 0x0000000000000000 0xffff888004ec00f0: 0x5a5a5a5a5a5a5a5a 0x5a5a5a5a5a5a5a5a 0xffff888004ec0100: 0xbbbbbbbbbbbbbbbb 0xbbbbbbbbbbbbbbbb 0xffff888004ec0110: 0x6b6b6b6b6b6b6b6b 0xa56b6b6b6b6b6b6b 0xffff888004ec0120: 0xbbbbbbbbbbbbbbbb 0xffff888004ec0190 0xffff888004ec0130: 0x0000000000000000 0x0000000000000000 0xffff888004ec0140: 0x0000000000000000 0x0000000000000000 0xffff888004ec0150: 0x0000000000000000 0x0000000000000000 0xffff888004ec0160: 0x0000000000000000 0x0000000000000000 0xffff888004ec0170: 0x5a5a5a5a5a5a5a5a 0x5a5a5a5a5a5a5a5a ``` 上面是開啟 SLUB debug 後兩個 free object 的樣子:0x6b 是 free object 的 poison(POISON_FREE),最後一個 byte 0xa5 是 POISON_END,0xbb 是 red zone(SLUB_RED_INACTIVE),0x5a 是 padding(POISON_INUSE);red zone 後面的 0xffff888004ec0110 就是 free pointer,指向下一個 free object(下一個 object 的 free pointer 則指向 0xffff888004ec0190)。 http://www.wowotech.net/memory_management/427.html ## 分配 內核主要提供兩個函數來分配記憶體,分別為: kmalloc(memory_size, flags)- 使用通用快取 kmem_cache_alloc(kmem_cache, flags)- 使用指定的快取 ## 釋放 ## trace ``` wget https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.15.7.tar.xz tar -xvf linux-6.15.7.tar.xz make defconfig ``` 先弄一個初始的 config 然後編譯下去 ``` make -j$(nproc) ``` 接下來製作開機用的檔案系統(用 busybox 做 rootfs) ``` git clone https://github.com/mirror/busybox.git make defconfig scripts/config --enable DEBUG_INFO_DWARF4 scripts/config --enable DEBUG_KERNEL scripts/config --enable FRAME_POINTER scripts/config --enable GDB_SCRIPTS scripts/config --enable KALLSYMS scripts/config --enable KALLSYMS_ALL scripts/config --enable CONFIG_SLUB scripts/config --enable CONFIG_SLUB_DEBUG scripts/config --enable CONFIG_SLUB_DEBUG_ON scripts/config --enable CONFIG_DEBUG_KERNEL scripts/config --enable CONFIG_SLUB_CPU_PARTIAL scripts/config --disable CONFIG_SLUB_TINY scripts/config 
--enable CONFIG_SLAB_FREELIST_HARDENED scripts/config --enable CONFIG_SLAB_FREELIST_RANDOM make -j$(nproc) make install mkdir -p proc sys dev etc/init.d ``` add-symbol-file sysfile/trace_slub.ko 0xffffffffc0000000 target remote localhost:1234 source script.gdb https://github.com/Naupjjin/Linux-Kernel-observer-slub-sandbox 整個 sandbox 在這裡 先來 trace 整個結構 ### trace slub struct https://elixir.bootlin.com/linux/v6.15-rc5/source/mm/slab.h#L382 ```c /* The list of all slab caches on the system */ extern struct list_head slab_caches; struct list_head { struct list_head *next, *prev; }; ``` slab caches 是把 kmem_cache 串起來的 header 上面有兩個 pointer  現在只有一個 object  之後拿那塊後,整個 slab 的 object 都已經被分配了,現在他就不再被控制了  當其中一塊被釋放後才會再次拿回來 再分配一塊,就會拿到一整個新的 slab  ```c +-kmem_cache--+ +-kmem_cache--+ +-kmem_cache--+ | cpu_slab |---+ | cpu_slab | | cpu_slab | | flags | | | flags | | flags | | size | | | size | | size | | object_size | | | object_size | | object_size | | offset | | | offset | | offset | +-slab_caches-+ | name | | | name | | name | ...<->| list_head |<->| list_head |<-------->| list_head |<->| list_head |<-> ... +-------------+ | random | | | random | | random | | node[] |-+ | | node[] | | node[] | +-------------+ | | +-------------+ +-------------+ | | +------------------------------------+ | | +----------------------------------+ | | | | +-__per_cpu_offset-+ | +----------| cpu0_offset | | | | cpu1_offset | [active page freelist (fast path)] | | | cpu2_offset | +-chunk---+ +-chunk---+ | | | ... 
| | ^ | | ^ | | | +------------------+ | |offset | | |offset | | | | v | | v | | | +---------------------------->| next |->| next |->NULL | v | +---------+ +---------+ | +-kmem_cache_cpu-+ | | | freelist |--+ [active page freelist (slow path)] | | page |---->+-page(active)---+ +-chunk---+ +-chunk---+ | | partial |--+ | freelist |----+ | ^ | | ^ | | +----------------+ | | | | | |offset | | |offset | | | +----------------+ | | v | | v | | | +--->| next |->| next |->NULL | | +---------+ +---------+ | | | | [partial page freelist] | +->+-page(partial)--+ +-chunk---+ +-chunk---+ | | freelist |----+ | ^ | | ^ | | | next |--+ | | |offset | | |offset | | +----------------+ | | | v | | v | | | +--->| next |->| next |->NULL | +-----------------+ +---------+ +---------+ | | | v [partial page freelist] | +-page(partial)--+ +-chunk---+ +-chunk---+ | | freelist |----+ | ^ | | ^ | | | next |--+ | | |offset | | |offset | | +----------------+ | | | v | | v | | | +--->| next |->| next |->NULL | +-----------------+ +---------+ +---------+ | | | v +--+ ... | [numa node partial page freelist] v +-page(numa-node)+ +-chunk---+ +-chunk---+ +-kmem_cache_node-+ | freelist |----+ | ^ | | ^ | | partial |---->| next |--+ | | |offset | | |offset | | | +----------------+ | | | v | | v | +-----------------+ | +--->| next |->| next |->NULL | ... | +----------------------+ +---------+ +---------+ | | | +-----------------+ | [numa node partial page freelist] | +-page(numa-node)+ +-chunk---+ +-chunk---+ | | freelist |----+ | ^ | | ^ | +->| next |--+ | | |offset | | |offset | +----------------+ | | | v | | v | | +--->| next |->| next |->NULL +----------------------+ +---------+ +---------+ | v ... ``` https://github.com/bata24/gef/issues/119 https://mp.weixin.qq.com/s?__biz=Mzg2MzU3Mjc3Ng==&mid=2247488095&idx=1&sn=ffc73f33cb03f0be7cf2ab1172fdc448&chksm=ce77d418f9005d0e9d95f223f679198fd2e40377abdcaf2c636962994c3508bcd814f9929937&cur_album_id=2559805446807928833&scene=190#rd
×
Sign in
Email
Password
Forgot password
or
By clicking below, you agree to our
terms of service
.
Sign in via Facebook
Sign in via Twitter
Sign in via GitHub
Sign in via Dropbox
Sign in with Wallet
Wallet (
)
Connect another wallet
New to HackMD?
Sign up