# 2022q1 Homework5 (quiz8) contribute by < `arthurchang09` > > [第 8 週測驗題和延伸問題的討論](https://hackmd.io/@arthur-chang/linux2022-quiz8-ans) ### 將組合語言引入 memchr_opt 下方組合語言可以順利完成 `alignment` 的部分,相當於 `memchr_opt` 第一個 `while` 迴圈, ```c void *func(const void *src_void, int c, size_t length){ const unsigned char *src = (const unsigned char *) src_void; unsigned char d = c; long al; char test; asm volatile( "jmp align\n\t" "loop:\n\t" "length:\n\t" "decq %1\n\t" "testq %1,%1\n\t" "jne equal\n\t" "mov $0,%0\n\t" "jmp end\n\t" "equal:\n\t" "mov (%0),%3\n\t" "cmp %%al,%3\n\t" "je end\n\t" "incq %0\n\t" "align:\n\t" "mov %0,%2\n\t" "andq $7,%2\n\t" "testq %2,%2\n\t" "jne loop\n\t" : "=D" (src), "=&c" (length),"=r"(al),"=r"(test) : "a" (c), "0" (src), "1" (length), "2"(al),"3"(test) : "memory","cc"); asm volatile( "end:\n\t" : "=D" (src), "=&c" (length),"=r"(al),"=r"(test) : "a" (c), "0" (src), "1" (length), "2"(al),"3"(test) : "memory","cc"); return (void *) src; ``` 目前已將 alignment 與 `DETEC_CHAR` 巨集的部分改寫成以下 inline assembly 。 ```c void *fun(const void *src_void, int c, size_t length){ const unsigned char *src = (const unsigned char *) src_void; unsigned char d = c; long al; char test; asm volatile( "jmp align\n\t" "loop:\n\t" "length:\n\t" "decq %1\n\t" "testq %1,%1\n\t" "jne equal\n\t" "mov $0,%0\n\t" "jmp end\n\t" "equal:" "mov (%0),%3\n\t" "cmp %%al,%3\n\t" "je end\n\t" "incq %0\n\t" "align:\n\t" "mov %0,%2\n\t" "testq $7,%2\n\t" "jne loop\n\t" : "=D" (src), "=&c" (length),"=r"(al),"=r"(test) : "a" (c), "0" (src), "1" (length), "2"(al),"3"(test) : "memory","cc"); if (!TOO_SMALL(length)) { unsigned long *asrc = (unsigned long *) src; unsigned long mask = d << 8 | d; mask = mask << 16 | mask; for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; unsigned long xor, consta = 0xFEFEFEFEFEFEFEFF,constb = 0x8080808080808080; unsigned long mask4 = mask; asm volatile( "detect_loop:\n\t" "movq %6,%1\n\t" "xorq (%0),%1\n\t" "movq %1,%2\n\t" "addq %4,%1\n\t" "notq %2\n\t" "andq %5,%2\n\t" "andq %1,%2\n\t" "testq %2,%2\n\t" "jne break_detect\n\t" "sub $8,%3\n\t" "add $8,%0\n\t" "cmp $7,%3\n\t" "jae detect_loop\n\t" :"=D"(asrc),"=r"(mask),"=r"(xor),"=r"(length) :"r"(consta),"r"(constb),"r"(mask4),"0"(asrc),"1"(mask),"2"(xor),"3"(length) :"memory"); asm volatile( "break_detect:\n\t" :"=D"(asrc),"=r"(mask),"=r"(xor),"=r"(length) :"r"(consta),"r"(constb),"r"(mask4),"0"(asrc),"1"(mask),"2"(xor),"3"(length) :"memory"); src = (unsigned char *) asrc; } asm volatile("repne\n\t" "scasb\n\t" "je 1f\n\t" "mov $1,%0\n" "1:\tdec %0" : "=D" (src), "=&c" (length) : "a" (c), "0" (src), "1" (length) : "memory"); asm volatile( "end:\n\t" : "=D" (src), "=&c" (length),"=r"(al),"=r"(test) : "a" (c), "0" (src), "1" (length), "2"(al),"3"(test) : "memory","cc"); return (void *) src; } ``` 效能與正確性測試程式如下 ```c int main(){ char str2[1000]; memset(str2,'0', sizeof(char)*1000); str[999] = '\0'; char ch3 = '4'; char *ret, *ret1, *ret3; for (int i = 0; i < 999; ++i) { memset(str2,'0', sizeof(char)*1000); str2[i] = '4'; struct timespec t1,t2,t3,t4,t5,t6; ret = memchr_opt(str2,ch3,strlen(str2)); clock_gettime(CLOCK_REALTIME, &t1); for (int j = 0; j < 100; ++j){ ret = memchr_opt(str2,ch3,strlen(str2)); } clock_gettime(CLOCK_REALTIME, &t2); clock_gettime(CLOCK_REALTIME, &t3); for (int j = 0; j < 100; ++j){ ret1 = fun(str2,ch3,strlen(str2)); } clock_gettime(CLOCK_REALTIME, &t4); clock_gettime(CLOCK_REALTIME, &t5); for (int j = 0; j < 100; ++j){ ret3 = memchr(str2,ch3,strlen(str2)); } clock_gettime(CLOCK_REALTIME, &t6); printf("%d %ld %ld %ld\n", i, (t2.tv_nsec - t1.tv_nsec) / 100, (t4.tv_nsec - t3.tv_nsec) / 100, (t6.tv_nsec - t5.tv_nsec) / 100); if (strlen(ret) != strlen(ret1)){ printf("error\n"); } else { for (int k = 0; k < strlen(ret); ++k){ if (ret[k] != ret1[k]){ printf("error2"); break; } } } } return 0; } ``` 編譯 ``` gcc -o a.out test.c ``` 初步在虛擬機 (CPU i7-10700K) userspace 之效能如下圖 ![](https://i.imgur.com/r6aLkEE.png) 雖然效能不及 `default-memchr` ,但已比 `memchr-opt` 運行速度更快。 #### 在 Linux Kernel Module 測試 預計採用 `fibdrv` 方式寫成 Linux character device driver,使用 `copy_from_user` 將字串從 `userspace` 複製到 kernel 放入 `kmalloc` 要到的記憶體空間,最後將結果使用 `copy_to_user` 回傳到 `userspace` ,並將執行時間用 return 的方式回傳。 ![](https://i.imgur.com/O6UrLAX.png) inline assembly 效能卻較差,試著在優化 將 alignment 的部分稍作修改,省一次 `jmp` 。 ```c asm volatile( "testq $7,%0\n\t" "je end_align\n\t" "loop:\n\t" "length:\n\t" "decq %1\n\t" "testq %1,%1\n\t" "jne equal\n\t" "mov $0,%0\n\t" "jmp end\n\t" "equal:" "mov (%0),%2\n\t" "cmp %%al,%2\n\t" "je end\n\t" "incq %0\n\t" "align:\n\t" "testq $7,%0\n\t" "jne loop\n\t" "end_align:\n\t" : "=D" (src), "=&c" (length),"=r"(test) : "a" (c), "0" (src), "1" (length),"2"(test) : "memory","cc"); ``` ![](https://i.imgur.com/TFkGlHM.png) :::info 考慮到之後會在 LKML 提交圖表,標示方式應該統一: * `glibc-memchr-x86`: glibc 裡頭 sysdep/x86 的 `memchr` 實作 * `linux-memchr-x86`: 原本 Linux 核心的 x86 `memchr` 實作 * `memchr-x86-opt`: 上述針對 Linux 核心 x86 `memchr` 的效能改進實作 ::: 回顧 inline assmebly 程式碼,用 `gcc -S test.c` 觀察組合語言,發現每一段 inline assmebly 前都會將原本 register 的植存回記憶體,我的實作又有數個分段的 inline assembly ,因此最佳的方式只要一塊 inline assembly 。 ``` .L23: cmpl $63, -60(%rbp) jbe .L24 movabsq $-72340172838076673, %rax movq %rax, -32(%rbp) movabsq $-9187201950435737472, %rax movq %rax, -24(%rbp) movq -32(%rbp), %r8 movq -24(%rbp), %r9 movq -48(%rbp), %r10 movq -40(%rbp), %rsi movq -16(%rbp), %rcx movq -8(%rbp), %rdx movq -88(%rbp), %rax movq %rsi, %rdi #APP # 180 "test.c" 1 detect_loop: movq (%rdi),%rcx xorq %r10,%rcx lea (%rcx,%r8), %rdx notq %rcx andq %r9,%rcx testq %rcx,%rdx jne break_detect sub $8,%rax add $8,%rdi cmp $7,%rax jae detect_loop ``` 經過數次對比測試,發現去掉下列部分組合語言: ```c asm volatile("repne\n\t" "scasb\n\t" "je 1f\n\t" "mov $1,%0\n" "1:\tdec %0" : "=D" (src), "=&c" (length) : "a" (c), "0" (src), "1" (length) : "memory"); ``` 去掉開頭 alignment 的組合語言在不開啟編譯器優化下效能沒差。但先去掉以節省將 register 存回 memory 的操作。 在虛擬機中 Linux 核心之效能表現不錯。和 `glibc-memchr-x86` 實作比較上,效能也有些許勝出,如下圖所示: ![](https://i.imgur.com/Djk8ew2.png) 以上測試皆在虛擬機上,轉移到實體電腦測試(CPU i5-7200U)。 ![](https://i.imgur.com/m7mMBNU.png) :::warning 參照 [std::find() and memchr() Optimizations](https://gms.tf/stdfind-and-memchr-optimizations.html) 一文,使用其效能分析的手法,對上述 `memchr-x86-opt` 和相關實作進行評比。 :notes: jserv ::: ### 參考 [std::find() and memchr() Optimizations](https://gms.tf/stdfind-and-memchr-optimizations.html) 的效能分析 接著參考 [std::find() and memchr() Optimizations](https://gms.tf/stdfind-and-memchr-optimizations.html) , 其採用 128 `KiB` 的 `char` buffer 儲存字串。會選擇這個大小是因為會有更好的 throughput ,參見 [GNU 註解](http://git.savannah.gnu.org/gitweb/?p=coreutils.git;a=blob;f=src/ioblksize.h;h=266c209f48fc07cb4527139a2548b6398b75f740;hb=HEAD#l23) ,我先在虛擬機是跑裡面的 script ``` for i in $(seq 0 10); do bs=$((1024*2**$i)) printf "%7s=" $bs timeout --foreground -sINT 2 \ dd bs=$bs if=/dev/zero of=/dev/null 2>&1 \ | sed -n 's/.* \([0-9.]* [GM]B\/s\)/\1/p' done ``` 得到以下輸出: ``` bash throughput.sh 1024=3.6 GB/s 2048=6.5 GB/s 4096=11.0 GB/s 8192=16.2 GB/s 16384=21.4 GB/s 32768=22.1 GB/s 65536=26.1 GB/s 131072=28.0 GB/s 262144=27.3 GB/s 524288=27.1 GB/s 1048576=28.3 GB/s ``` 筆電雙系統 ``` 1024=916 MB/s 2048=1.8 GB/s 4096=3.3 GB/s 8192=5.7 GB/s 16384=9.0 GB/s 32768=11.5 GB/s 65536=15.0 GB/s 131072=17.0 GB/s 262144=17.4 GB/s 524288=17.8 GB/s 1048576=18.0 GB/s ``` 每次執行執會稍有不同,但可看出大約在 $131072 = 128\times 1024 KiB$ 速度就達到接近頂峰,因此選擇這個大小。 此文作者程式碼主要是測試較為破碎的字串,輸入資料擷取部分如下: ```= W,w,WW,WWW,WY|Y,y,A a,AA,AAA,aah,ah|AI,air,AR Ar,Au,aw,E,e|ea,ear,EEO e'er,eh,EOE,ER,Er|er,EU,Eu Eur,I,i,IA,Ia|IE,IEEE,ii iii,Io,IOU,Ir,O|o,oar,OE o'er,OH,oh,oi,ooh|OR,or,our ow,U,u,UAR,UAW|ugh,uh,Ur ``` 測試程式會用一次 `find()` ,和 `memchr` 功能相近的程式,去找每一行的 `\n` 的位址,再用一次 `find()` 找到 `|` 。以上方的資料為例,第一次尋找會找到 `W,w,WW,WWW,WY|Y,y,A` ,第二次會找到 `|Y,y,A` 。接著移向下一行再尋找,直到 buffer ,即暫存所有資料的 array 走到盡頭,接著會再繼續從檔案讀取下一波 `131072` byte 資料。因此,估計在不開優化的情形下 `linux-memchr-x86` 的表現不會太差。 以下為改寫成 C 之測試程式碼 ::: spoiler ```c= int main() { char buffer[128*1024 + 64]; int N = 128 * 1024; int n = 3000; FILE* fd = fopen("in", "r"); size_t r, off = 0; char *e, *x = buffer, *y; const char *b1 = buffer + N; for (int i = 0; i < n; ++i) { fseek(fd, 0, SEEK_SET); while (1) { off = 0; r = fread(buffer, sizeof(char), N, fd); char *b = buffer; for (;;) { x = memchr_x86(b, '\n', b1 - x); if (!x) { break; } y = memchr_x86(b, '|', x - b); *x = '\0'; off += y - b; if (y != x) { printf("%ld\n", off); } if (x >= b1) { break; } b = x + 1; off = 0; } if (r != N) { break; } } } } ``` ::: 採用 command line 的 `time` 測量整個程式的執行時間,不開啟編譯器優化 ``` time taskset 0x01 ./b.out ``` | | memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |:---------:|:--------------:|:----------------:|:----------------:| | real time | 11.33825 | 11.72425 | 13.076 | | user time | 11.0325 | 11.406 | 12.726 | | sys time | 0.2935 | 0.32425 | 0.344 | 由上方表格可看出在短字串下 `linux-memchr-x86` 表現比 `glibc-memchr-x86` 好 接下來測試開啟編譯器優化,由於先前 inline assembly 的 label 是採用文字,造成開啟 `-O3` 優化時會出現錯誤,因此在此修改 label ,改用數字表示,且在 `jump` instruction 後的 label 要加上 `f` 或 `b` 表示會往下方或上方的 label 進行 jump ,使編譯器 unroll 時不會發生錯誤,如下: ```c void *fun1(const void *src_void, int c, size_t length){ const unsigned char *src = (const unsigned char *) src_void; unsigned char d = c; long al; char test; while (UNALIGNED(src)) { if (!length--) return NULL; if (*src == d) return (void *) src; src++; } if (!TOO_SMALL(length)) { unsigned long *asrc = (unsigned long *) src; unsigned long mask = d << 8 | d; mask = mask << 16 | mask; for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; unsigned long xor, consta = 0xFEFEFEFEFEFEFEFF,constb = 0x8080808080808080; unsigned long data; asm volatile( "1:\n\t" "movq (%0),%1\n\t" "xorq %6,%1\n\t" "lea (%1,%4), %2\n\t" "notq %1\n\t" "andq %5,%1\n\t" "testq %1,%2\n\t" "jne 2f\n\t" "sub $8,%3\n\t" "add $8,%0\n\t" "cmp $7,%3\n\t" "jae 1b\n\t" "2:\n\t" :"=D"(asrc),"=r"(data),"=r"(xor),"=r"(length) :"r"(consta),"r"(constb),"r"(mask),"0"(asrc),"1"(data),"2"(xor),"3"(length) :"memory"); src = (unsigned char *) asrc; } while (length--) { if (*src == d){ return (void *) src; } src++; } return (void *) NULL; } ``` | | memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |:---------:|:--------------:|:----------------:|:----------------:| | real time | 8.34025 | 11.1175 | 8.4965 | | user time | 8.01525 | 10.82 | 8.23925 | | sys time | 0.312 | 0.291 | 0.257 | 開啟 `-O3` 後, 三者的速度都有所加快。其中 `glibc-memchr-x86` 之速度加快很多,但仍較 `memchr-x86-opt` 慢。 接著回去看組合語言,兩者皆是將函式直接放入 `int main()` 中,因此 funtion call 的成本兩者應該是相近的,其中並未使用 loop unrolling 的方式優化。因此推測是 inline assmebly 的部分帶來的加速效果。因此我使用原本測驗一的程式碼再跑一次測試,並使用 `perf` 比較是否是組合語言達成這個效果,如下: ::: spoiler memchr-x86-opt(assembly) ``` sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null Performance counter stats for 'taskset 0x01 ./b.out' (10 runs): 205,4433,7709 cycles ( +- 0.02% ) (66.63%) 447,8938,8415 instructions # 2.18 insn per cycle ( +- 0.00% ) (83.32%) 282,0384 cache-misses # 1.288 % of all cache refs ( +- 7.92% ) (83.33%) 2,1897,1894 cache-references ( +- 1.35% ) (83.35%) 2,6449,1465 branch-misses ( +- 0.01% ) (83.36%) 96,5328,5976 branch ( +- 0.00% ) (83.33%) 8.25079 +- 0.00243 seconds time elapsed ( +- 0.03% ) ``` memchr-x86-opt(original) ``` sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null Performance counter stats for 'taskset 0x01 ./b.out' (10 runs): 218,3588,7203 cycles ( +- 0.05% ) (66.66%) 448,9144,0365 instructions # 2.06 insn per cycle ( +- 0.00% ) (83.34%) 342,8490 cache-misses # 1.737 % of all cache refs ( +- 8.35% ) (83.34%) 1,9738,5499 cache-references ( +- 0.89% ) (83.34%) 2,5664,0313 branch-misses ( +- 0.02% ) (83.34%) 95,5833,8661 branch ( +- 0.00% ) (83.32%) 8.77154 +- 0.00486 seconds time elapsed ( +- 0.06% ) ``` glibc-memchr-x86 ``` sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null Performance counter stats for 'taskset 0x01 ./b.out' (10 runs): 211,8621,2328 cycles ( +- 0.03% ) (66.64%) 444,6003,3863 instructions # 2.10 insn per cycle ( +- 0.00% ) (83.33%) 335,5993 cache-misses # 1.595 % of all cache refs ( +- 5.27% ) (83.34%) 2,1047,1090 cache-references ( +- 1.85% ) (83.34%) 2,5254,3590 branch-misses ( +- 0.00% ) (83.34%) 97,1084,9600 branch ( +- 0.00% ) (83.34%) 8.50679 +- 0.00295 seconds time elapsed ( +- 0.03% ) ``` linux-memchr-x86 ``` sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null Performance counter stats for 'taskset 0x01 ./b.out' (10 runs): 280,0662,7457 cycles ( +- 0.09% ) (66.65%) 309,2942,6350 instructions # 1.10 insn per cycle ( +- 0.01% ) (83.33%) 1548,9334 cache-misses # 7.536 % of all cache refs ( +- 3.03% ) (83.33%) 2,0553,4812 cache-references ( +- 1.52% ) (83.34%) 72,7369 branch-misses ( +- 0.67% ) (83.34%) 61,5698,8166 branch ( +- 0.01% ) (83.34%) 11.2574 +- 0.0114 seconds time elapsed ( +- 0.10% ) ``` ::: 整理如下: | | memchr-x86-opt(assembly) | memchr-x86-opt(original) | glibc-memchr-x86 | linux-memchr-x86 | |:--------------:|:------------------------:|:------------------------:|:------------------:|:-----------------:| | time(s) | 8.25079 +- 0.00243 | 8.77154 +- 0.00486 | 8.50679 +- 0.00295 | 11.2574 +- 0.0114 | | instructions | 447,8938,8415 | 448,9144,0365 | 444,6003,3863 | 309,2942,6350 | | cache-misses | 282,0384 | 342,8490 | 335,5993 | 1548,9334 | | insn per cycle | 2.18 | 2.06 | 2.10 | 1.10 | 相較於測驗一原始程式,可以看出我修改的 inline assembly 在 IPC(instruction per cycle) 以及 cache-misses 有所領先。 以上的測試主要是針對較短字串,若目標字元在整個字串相對靠後的位置,表現應該會有所變化。接下來我對輸入資料做修改,在 `|` 之前至少有 200 個單字,後面有 1 ~ 50 個單字,每個單字平均長度以 5 個字元計算,包括逗號,至少需比較 1199 個字元。 資料產生的方式為以下 `word_gen.py` ,從 `/usr/share/dict/words` 中隨機選取單詞組合 ```python #!/usr/bin/env python3 import random import string words = open('/usr/share/dict/words').read().splitlines() ranstr = '' f = open('in2.txt', 'w') for i in range(15833): ranstr1 = '' lengh = random.randint(200,400) for j in range(lengh): ranstr += words[random.randint(0,len(words) - 1)] if (j < lengh - 1): ranstr += ',' ranstr += '|' lengh = random.randint(1,50) for j in range(lengh): ranstr += words[random.randint(0,len(words) - 1)] if (j < lengh - 1): ranstr += ',' print(ranstr, file = f) ranstr = '' f.close() ``` 範例字串 ``` dawns,gratitude's,Houdini,barnyard,OHSA's,Florence's,rehabilitates,foulest,conquistadores,divide's,insert's,ramifications ``` 開啟 `-O3` 優化 結果 | | memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |:---------:|:--------------:|:----------------:|:----------------:| | real time | 1m 5.57875 | 4m 20.38075 | 1m 16.8215 | | user time | 45.76925 | 3m 57.2797 | 55.55025 | | sys time | 19.73975 | 22.98525 | 21.2315 | 未開啟優化 : | | memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |:---------:|:--------------:|:----------------:|:----------------:| | real time | 1m 30.063 | 4m 20.9525 | 2m 28.592 | | user time | 1m 8.37275 | 3m 57.7035 | 2m 6.033 | | sys time | 21.73575 | 23.12075 | 22.484 | 由以上表格可看出在不開優化下 ` linux-memchr-x86` 在長字串的搜尋就沒有比 `glibc-memchr-x86` 快。 由於 `time` 這個命令可能會將 ELF loader 和 VMA 的成本都算進去,因此改用 `struct timespec` 來測量時間這 3000 次 iteration 總共花的時間。 結果如下: 開啟 `-O3` | | memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |:-------:|:--------------:|:----------------:|:----------------:| | 時間(s) | 65.403 | 259.202 | 65.284 | 未開啟優化 | | memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |:-------:|:--------------:|:----------------:|:----------------:| | 時間(s) | 89.296 | 259.409 | 147.488 | 意外的是開啟 `-O3` 時 `glibc-memchr-x86` 這次的測試成績居然較 `memchr-x86-opt` 佳,大約可以快約 0.1 ~ 0.8 秒,因此我用同樣的方式重新測量了一次時間,結果 `glibc-memchr-x86` 時間又回到之前採用 `time` 命令的長度,反而 `memchr-x86-opt` 的執行時間沒有太大改變,維持 65 秒上下,不太清楚為何有這 10 秒的差距。 | | memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |:-------:|:--------------:|:----------------:|:----------------:| | 時間(s) | 65.398 | 259.214 | 76.24 | 接著我在程式碼不同的位置放上 `clock_gettime` 測量不同區段的執行時間。首先,放在上方程式碼 17 行 `for` 迴圈前後,將每次跑迴圈的時間累加起來 。接著是在 `while` 迴圈前後,將每次檔案處理完的時間加起來,比較這其中的差距。 逐次 `for` 迴圈時間總和 | | memchr-x86-opt | glibc-memchr-x86 | |:-------:|:--------------:|:----------------:| | 時間(s) | 45.5391 | 44.93 | 逐次 `while` 迴圈時間總和 | | memchr-x86-opt | glibc-memchr-x86 | |:-------:|:--------------:|:----------------:| | 時間(s) | 69.74045 | 76.209 | 明顯觀察到 `for` 迴圈的執行時間 `glibc-memchr-x86` 較快,而 `while` 迴圈中則變慢,且 `while` 迴圈的測量數據中, `memchr-x86-opt` 竟然較測量 3000 次 iteration 長,明顯測量上有問題。 接著我測量跑完單一檔案所花的時間,也就是 `while` 迴圈前後的位置放上 `clock_gettime` ,但只執行一次就跳出最外層的 `for` 迴圈,結果如下: | | memchr-x86-opt | glibc-memchr-x86 | |:-------:|:--------------:|:----------------:| | 時間(s) | 0.02173 | 0.02484 | 上方表格的數字乘 3000 就和使用 `time` 以及測量 3000 iteration 的時間相近, #### 重新設定 isolcpus 在進行效能測試 跟老師討論後,認為時間的差距有可能是因為 CPU0 存在一些特定的裝置驅動或 kthread,造成測量的時間有誤,因此重新限定 CPU 給特定的程式使用。我的 CPU 為 2 核 4 緒,因此將 CPU2 和 CPU 3 獨立出來。 進入 /etc/default/grub ``` sudo vim /etc/default/grub ``` 看到以下這行 ``` GRUB_CMDLINE_LINUX_DEFAULT="quiet splash" ``` 改成 ``` GRUB_CMDLINE_LINUX_DEFAULT="quiet splash isolcpus=2,3" ``` #### 關閉 Hyper Thread 在筆電的 BIOS 翻了一圈並沒有相關選項,因此無法關閉。推測可能是因為筆電的 BIOS 限制較嚴格,加上我的筆電 CPU 其實只有兩個實體核,故無法關閉。 #### 結果 為了去除可能的誤差,我按照老師的提示,運用統計手法去除極端值,並開啟 `-O2` 優化。 首先,測量一次檔案的時間,總共取 1000 次,並且取 95% 信賴區間。 | | memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |:-----------:|:---------------:|:----------------:|:----------------:| | 時間範圍(s) | 0.02190 ~ 0.02201 | 0.08536 ~ 0.08713 | 0.02256 ~0.02275 | 可看出在 `-O2` 優化下, `memchr-x86-opt` 和 `glibc-memchr-x86` 效能是相差無幾,兩者的核心概念是相同,只是實作方式稍有不同。我所新增的組合語言也是基於 `-O3` 的組合語言改寫,因此效能差不多是在意料之中。 相較於 `linux-memchr-x86` , `memchr-x86-opt` 的執行時間是 前者的 $\frac{1}{4}$ ,可見 SWAR 、 word-wide comparing 是十分有效的方法。 #### 以 UML/x86-64 觀察 memchr 的機械碼 UML 之核心版本為 5.15,`memchr` 反組譯的結果如下: ```c $ objdump -xd linux | grep -A12 "<memchr>" 000000006020a91e <memchr>: 6020a91e: f3 0f 1e fa endbr64 6020a922: 48 89 f8 mov %rdi,%rax 6020a925: 48 01 fa add %rdi,%rdx 6020a928: 48 39 d0 cmp %rdx,%rax 6020a92b: 74 0e je 6020a93b <memchr+0x1d> 6020a92d: 48 8d 48 01 lea 0x1(%rax),%rcx 6020a931: 40 38 30 cmp %sil,(%rax) 6020a934: 74 07 je 6020a93d <memchr+0x1f> 6020a936: 48 89 c8 mov %rcx,%rax 6020a939: eb ed jmp 6020a928 <memchr+0xa> 6020a93b: 31 c0 xor %eax,%eax 6020a93d: c3 retq ``` 簡單觀察組合語言, `lea` 將資料讀入 register , `cmp` 將讀入的資料和字元做比較,其中 `%sil` 是一個 8-bits register ,若相同,則 Zero Flag 設起,會跳到 `req` 回傳值。因此這部分的組合語言為 byte-wise comparing 。 #### memchr-opt-x86 之組合語言微調 以下組合語言輸出命令如下 `gcc -S asm.c` `gcc -S -O2 asm.c` 並使用 `diff` 命令比較組合語言差異 * 移除 asrc 然後檢查組合輸出 不開啟優化時,組合語言是有些微區別。比較後發現主要的差別是進行 function call 時,從記憶體取值的位址有差異。最大不同處位於 ` if (!TOO_SMALL(length))` 中的部分,多了 ```c movq -56(%rbp), %rax movq %rax, -40(%rbp) ``` 也就是 `unsigned long *asrc = (unsigned long *) src;` 這段程式碼的組合語言。 但開啟 `-O2` 優化後,這部分被優化掉,有沒有 上述這段程式碼不影響組合語言。 * 試著 consta 和 consta 宣告時加 const 不開啟優化時,兩者組合語言沒有差別,開啟 `-O2` 優化時也沒有差異。 ## 對比 memchr 之前有哪些開發者變更 我直接 `git clone` Linux 在 Github 上的專案。 `git blame arch/x86/lib/string_32.c` ``` b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 180) #ifdef __HAVE_ARCH_MEMCHR 8cf36d2bc5832 arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-02-19 23:09:59 +0100 181) void *memchr(const void *cs, int c, size_t count) b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 182) { b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 183) int d0; b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 184) void *res; b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 185) if (!count) b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 186) return NULL; 8cf36d2bc5832 arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-02-19 23:09:59 +0100 187) asm volatile("repne\n\t" b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 188) "scasb\n\t" b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 189) "je 1f\n\t" b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 190) "movl $1,%0\n" b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 191) "1:\tdecl %0" 3492cdf0176bd arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-08-02 21:25:13 +0200 192) : "=D" (res), "=&c" (d0) 3492cdf0176bd arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-08-02 21:25:13 +0200 193) : "a" (c), "0" (cs), "1" (count) 3492cdf0176bd arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-08-02 21:25:13 +0200 194) : "memory"); b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 195) return res; b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 196) } b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 197) EXPORT_SYMBOL(memchr); b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 198) #endif ``` memchr 基本上由 Andi Kleen 和 Paolo Ciarrocchi 變更 `memchr` 這段程式碼,主要內容還是由 Andi Kleen 撰寫。 :::info `memchr` 的改進描述,可參見 ==[arm64: Better optimised memchr()](https://lore.kernel.org/r/58471b42f9287e039dafa9e5e7035077152438fd.1622128527.git.robin.murphy@arm.com)== > Although we implement our own assembly version of `memchr()`, it turns out to be barely any better than what GCC can generate for the generic C version (and would go wrong if the `size_t` argument were ever large enough to be interpreted as negative). Unfortunately we can't import the tuned implementation from the Arm [optimized-routines](https://github.com/ARM-software/optimized-routines) library, since that has some Advanced SIMD parts which are not really viable for general kernel library code. What we can do, however, is pep things up with some relatively straightforward word-at-a-time logic for larger calls. > > Adding some timing to optimized-routines' `memchr()` test for a simple benchmark, overall this version comes in around half as fast as the SIMD code, but still nearly 4x faster than our existing implementation. commit 9e51cafd783b22018fb15bfb06d65f69349223a9 ::: ### 撰寫 git commit message (草稿) 思考中... The original assembly version of `memchr()` is implemented with the byte-wise comparing technic, which is not fully using whole 64-bits registers in x86-64 CPU. We use word-wide comparing so that 8 characters can be compared at the same time on x86-64 CPU. First we align the input and then use word-wise comparing to find the first 64-bit word that content the target. Secondly, we compare every byte in the word and get the output. We create two files to measure the performance. The first file contains on average 10 characters ahead the target character. The second file contains at least 1000 characters ahead the target character. It turns out that "memchr()" is slightly better in the first test and nearly 4x faster than the orginal implementation in the second test. ## string.h 的結構以及 `memchr-opt-x86` 可能的添加方式 以下節錄 `/include/linux/string.h` 以及 `/lib/string.c` ```c #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif ``` ```c #ifndef __HAVE_ARCH_MEMCHR /** * memchr - Find a character in an area of memory. * @s: The memory area * @c: The byte to search for * @n: The size of the area. * * returns the address of the first occurrence of @c, or %NULL * if @c is not found */ void *memchr(const void *s, int c, size_t n) { const unsigned char *p = s; while (n-- != 0) { if ((unsigned char)c == *p++) { return (void *)(p - 1); } } return NULL; } EXPORT_SYMBOL(memchr); #endif ``` 可看到若無 `def __HAVE_ARCH_MEMCHR` ,則會使用 `string.h` 中的 `memchr`。 追溯 `__HAVE_ARCH_MEMCHR` ,發現在 `arch` 目錄下各個架構 CPU ,如 arm 、 arm64 、 powerpc 、 x86 等的 `string.h` 都有,其中 x86 的部分只有在 `string_32.h` 、 `string_32.c` 中有 define。 接著觀察 x86 這兩個檔案,擷取如下: ```c #define __HAVE_ARCH_MEMCHR extern void *memchr(const void *cs, int c, size_t count); ``` ```c #ifdef __HAVE_ARCH_MEMCHR void *memchr(const void *cs, int c, size_t count) { int d0; void *res; if (!count) return NULL; asm volatile("repne\n\t" "scasb\n\t" "je 1f\n\t" "movl $1,%0\n" "1:\tdecl %0" : "=D" (res), "=&c" (d0) : "a" (c), "0" (cs), "1" (count) : "memory"); return res; } EXPORT_SYMBOL(memchr); #endif ``` `string_32.h` 中 define `__HAVE_ARCH_MEMCHR` ,並在 `string_32.c` 中判斷是否已 define ,並且實作程式碼。 接著再看 `string_64.h` ,其內容也類似上述描述,都是 `#define __HAVE_ARCH_XXXX` 接著 function 的宣告,只是沒有統一的 `string_64.c` ,而是各個 `.s` file 的組合語言檔案。因此,`memchr-opt-x86` 要加入且暫時不管 `x86-32` 的話,應該是在 `string_64.h` 做出類似前述的 `#defin` ,並且在 `/arch/x86/lib` 建立一個對應的 `.c` 檔。 之後會嘗試在 UML 上修改並測試。 ### 嘗試在 string_64.h 增加 memchr 增加下面這兩行 ```c #define __HAVE_ARCH_MEMCHR extern void *memchr(const void *cs, int c, size_t count); ``` 並將我的 `memchr-opt-x86` 新增成 `string_64.c`,如下: :::spoiler ```c #define __NO_FORTIFY #include <linux/string.h> #include <linux/export.h> /* Nonzero if either X or Y is not aligned on a "long" boundary */ #define UNALIGNED(X) ((long) X & (sizeof(long) - 1)) /* How many bytes are loaded each iteration of the word copy loop */ #define LBLOCKSIZE (sizeof(long)) /* Threshhold for punting to the bytewise iterator */ #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE) #ifdef __HAVE_ARCH_MEMCHR void *memchr(const void *src_void, int c, size_t length){ const unsigned char *src = (const unsigned char *) src_void; unsigned char d = c; while (UNALIGNED(src)) { if (!length--) return NULL; if (*src == d) return (void *) src; src++; } if (!TOO_SMALL(length)) { //unsigned long *asrc = (unsigned long *) src; unsigned long mask = d << 8 | d; unsigned int i = 32; mask = mask << 16 | mask; for (i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; unsigned long xor, consta = 0xFEFEFEFEFEFEFEFF,constb = 0x8080808080808080; unsigned long data; asm volatile( "1:\n\t" "movq (%0),%1\n\t" "xorq %6,%1\n\t" "lea (%1,%4), %2\n\t" "notq %1\n\t" "andq %5,%1\n\t" "testq %1,%2\n\t" "jne 2f\n\t" "sub $8,%3\n\t" "add $8,%0\n\t" "cmp $7,%3\n\t" "jae 1b\n\t" "2:\n\t" :"=D"(src),"=r"(data),"=r"(xor),"=r"(length) :"r"(consta),"r"(constb),"r"(mask),"0"(src),"1"(data),"2"(xor),"3"(length) :"memory"); //src = (unsigned char *) asrc; } while (length--) { if (*src == d){ return (void *) src; } src++; } return (void *) NULL; } EXPORT_SYMBOL(memchr); #endif ``` ::: compile 成 UML 並使用命令 ` objdump -xd linux | grep -A12 "<memchr>"` 卻沒有找到 `memchr` 之組合語言,使用命令 ` objdump -xd linux | grep memchr` ,部分輸出如下: ``` 0000000060236a09 l F .text 00000000000000bc memchr_inv 000000006001ca80 F *UND* 0000000000000000 memchr@@GLIBC_2.2.5 000000006001ca80 <memchr@plt>: 6001ca80: ff 25 72 37 55 00 jmpq *0x553772(%rip) # 605701f8 <memchr@GLIBC_2.2.5> ``` 顯示成功移除掉原本的實作,但我的程式並沒有成功放入。推測是沒有進行編譯並放入核心,因此要需要對 `Makefile` 做修改。 `arch/x86/lib` ```diff ... ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o lib-y += atomic64_cx8_32.o lib-y += checksum_32.o lib-y += strstr_32.o lib-y += string_32.o ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o endif lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else obj-y += iomap_copy_64.o lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o + lib-y += string_64.o lib-y += cmpxchg16b_emu.o endif ``` 增加此行後,按照前述反組譯仍沒有出現,我繼續尋找解法。我追蹤 `string_32.o` ,發現在 `arch/x86/um/Makefile` 中有一段如下 ```=18 ifeq ($(CONFIG_X86_32),y) obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o subarch-y += ../kernel/sys_ia32.o else obj-y += syscalls_64.o vdso/ subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o endif ``` 在第 23 行有 `../lib/string_32.o` ,其所在的 `ifeq` 是判斷 CPU 是否為 32 bits ,則下面的 else 必為 64 bits ,因此我在第 31 行最後面增加 `../lib/string_64.o` ,再編譯一次 UML 。 首先輸入命令 `objdump -xd linux | grep memchr`,擷取部分輸出如下: ```c 0000000060236a63 l F .text 00000000000000bc memchr_inv 0000000060036cde l F .text 00000000000000aa memchr ``` 接著 `objdump -xd linux | grep -A53 "<memchr>"` ```c 0000000060036cde <memchr>: 60036cde: f3 0f 1e fa endbr64 60036ce2: 41 89 f0 mov %esi,%r8d 60036ce5: 40 f6 c7 07 test $0x7,%dil 60036ce9: 74 1d je 60036d08 <memchr+0x2a> 60036ceb: 48 85 d2 test %rdx,%rdx 60036cee: 75 07 jne 60036cf7 <memchr+0x19> 60036cf0: 31 ff xor %edi,%edi 60036cf2: e9 8d 00 00 00 jmpq 60036d84 <memchr+0xa6> 60036cf7: 48 ff ca dec %rdx 60036cfa: 44 38 07 cmp %r8b,(%rdi) 60036cfd: 0f 84 81 00 00 00 je 60036d84 <memchr+0xa6> 60036d03: 48 ff c7 inc %rdi 60036d06: eb dd jmp 60036ce5 <memchr+0x7> 60036d08: 48 83 fa 07 cmp $0x7,%rdx 60036d0c: 76 60 jbe 60036d6e <memchr+0x90> 60036d0e: 49 b9 ff fe fe fe fe movabs $0xfefefefefefefeff,%r9 60036d15: fe fe fe 60036d18: 89 f1 mov %esi,%ecx 60036d1a: 40 0f b6 f6 movzbl %sil,%esi 60036d1e: 31 c0 xor %eax,%eax 60036d20: 49 ba 80 80 80 80 80 movabs $0x8080808080808080,%r10 60036d27: 80 80 80 60036d2a: c1 e1 08 shl $0x8,%ecx 60036d2d: 0f b7 c9 movzwl %cx,%ecx 60036d30: 09 f1 or %esi,%ecx 60036d32: 48 63 c9 movslq %ecx,%rcx 60036d35: 48 89 ce mov %rcx,%rsi 60036d38: 48 c1 e6 10 shl $0x10,%rsi 60036d3c: 48 09 ce or %rcx,%rsi 60036d3f: 48 89 f1 mov %rsi,%rcx 60036d42: 48 c1 e1 20 shl $0x20,%rcx 60036d46: 48 09 f1 or %rsi,%rcx 60036d49: 31 f6 xor %esi,%esi 60036d4b: 48 8b 07 mov (%rdi),%rax 60036d4e: 48 31 c8 xor %rcx,%rax 60036d51: 4a 8d 34 08 lea (%rax,%r9,1),%rsi 60036d55: 48 f7 d0 not %rax 60036d58: 4c 21 d0 and %r10,%rax 60036d5b: 48 85 c6 test %rax,%rsi 60036d5e: 75 0e jne 60036d6e <memchr+0x90> 60036d60: 48 83 ea 08 sub $0x8,%rdx 60036d64: 48 83 c7 08 add $0x8,%rdi 60036d68: 48 83 fa 07 cmp $0x7,%rdx 60036d6c: 73 dd jae 60036d4b <memchr+0x6d> 60036d6e: 48 01 fa add %rdi,%rdx 60036d71: 48 39 fa cmp %rdi,%rdx 60036d74: 0f 84 76 ff ff ff je 60036cf0 <memchr+0x12> 60036d7a: 44 38 07 cmp %r8b,(%rdi) 60036d7d: 74 05 je 60036d84 <memchr+0xa6> 60036d7f: 48 ff c7 inc %rdi 60036d82: eb ed jmp 60036d71 <memchr+0x93> 60036d84: 48 89 f8 mov %rdi,%rax 60036d87: c3 retq ``` 注意到 `60036d4b` 到 `60036d6c` 是我寫的組合語言,至此我的程式成功進入 UML 中。 #### `unsigned long` 或 `long` 的 macro 執行 `grep -r "sizeof(unsigned long)" *` 或 `grep -r "sizeof(long)" *` 找到以下 ```c arch/x86/crypto/crc32c-intel_glue.c:#define SCALE_F sizeof(unsigned long) ``` ### Linux 核心之 unaligned / align macro 在尋找 macro 前,發現 kernel 中有些程式碼直接實作 unalign 而非使用 macro ,如 `/lib/string.c` 中的 `strscpy` ```c ssize_t strscpy(char *dest, const char *src, size_t count) { ... #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* * If src is unaligned, don't cross a page boundary, * since we don't know if the next page is mapped. */ if ((long)src & (sizeof(long) - 1)) { size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1)); if (limit < max) max = limit; } #else /* If src or dest is unaligned, don't do word-at-a-time. */ if (((long) dest | (long) src) & (sizeof(long) - 1)) max = 0; #endif ... } ``` 在 `include/linux/align.h` 有 alignment 的 macro 。 ```c #define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) ``` 對比原本的 `unalign` ```c #define UNALIGNED(X) ((long) X & (sizeof(long) - 1)) ``` 可發現實作方式基本一樣,只是多個 `== 0` ,因此可以直接使用。 改成如下 ```c while (!IS_ALIGNED((long)src, sizeof(long))) { if (!length--) return NULL; if (*src == d) return (void *) src; src++; } ``` ## 在 UML 測試 memchr 參考 [建構 User-Mode Linux 的實驗環境](https://hackmd.io/@sysprog/user-mode-linux-env) 編譯完 `linux` 執行檔,進行 UML 的設定時,執行 `UML.sh`,其內容如下 ``` #!/bin/sh ./linux umid=uml0 \ root=/dev/root rootfstype=hostfs hostfs=./rootfs \ rw mem=64M init=/bin/sh quiet ``` 出現以下錯誤 ``` Aborted (core dumped) ``` 於是決定直接執行 `./linux` 並比較 原版 ``` Core dump limits : soft - 0 hard - NONE Checking that ptrace can change system call numbers...OK Checking syscall emulation patch for ptrace...OK Checking advanced syscall emulation patch for ptrace...OK Checking environment variables for a tempdir...none found Checking if /dev/shm is on tmpfs...OK Checking PROT_EXEC mmap in /dev/shm...OK Adding 23236608 bytes to physical memory to account for exec-shield gap Linux version 5.15.0 (hihihi@hihihi-VirtualBox) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #1 Sun May 8 15:57:10 CST 2022 Zone ranges: Normal [mem 0x0000000000000000-0x0000000063628fff] Movable zone start for each node Early memory node ranges node 0: [mem 0x0000000000000000-0x0000000003628fff] Initmem setup node 0 [mem 0x0000000000000000-0x0000000003628fff] Built 1 zonelists, mobility grouping on. Total pages: 13675 Kernel command line: root=98:0 console=tty Dentry cache hash table entries: 8192 (order: 4, 65536 bytes, linear) Inode-cache hash table entries: 4096 (order: 3, 32768 bytes, linear) mem auto-init: stack:off, heap alloc:off, heap free:off Memory: 26104K/55460K available (3379K kernel code, 925K rwdata, 1100K rodata, 144K init, 164K bss, 29356K reserved, 0K cma-reserved) ... ``` 增加 `string_64.c` 版本 ``` Core dump limits : soft - 0 hard - NONE Checking that ptrace can change system call numbers...OK Checking syscall emulation patch for ptrace...OK Checking advanced syscall emulation patch for ptrace...OK Checking environment variables for a tempdir...none found Checking if /dev/shm is on tmpfs...OK Checking PROT_EXEC mmap in /dev/shm...OK Adding 1441792 bytes to physical memory to account for exec-shield gap Aborted (core dumped) ``` 似乎是要印出 `Linux version 5.15.0 ...` 時出現 `Aborted (core dumped)` ,經過一番查找,發現是我自己寫的 `memchr` 中組合語言有問題,用原本測驗題的程式碼不會 `Aborted (core dumped)`,經過測試,發現是迴圈的條件判斷出問題,即這段指令 `jae 1b\n\t` ,修改成 `ja 1b\n\t` 即沒有問題。 在 Userspace 進行正確性驗證,將測試字串放入 `memchr-opt-x86` 及 glibc 採用 SSE2 指令的 `memchr` 並比較字串內容,測試到字串長度為 10 萬皆相同。 修改後的結果如下: 5/20 加入初始版註解 :::spoiler ```c // SPDX-License-Identifier: GPL-2.0 #include <linux/string.h> #include <linux/export.h> #include <linux/align.h> /* How many bytes are loaded each iteration of the word copy loop */ #define LBLOCKSIZE (sizeof(long)) #ifdef __HAVE_ARCH_MEMCHR void *memchr(const void *cs, int c, size_t length) { const unsigned char *src = (const unsigned char *)cs; unsigned char d = c; while (!IS_ALIGNED((long)src, sizeof(long))) { if (!length--) return NULL; if (*src == d) return (void *)src; src++; } if (length >= LBLOCKSIZE) { unsigned long mask = d << 8 | d; unsigned int i = 32; long xor, data; const long consta = 0xFEFEFEFEFEFEFEFF, constb = 0x8080808080808080; /* * Create a 8-bytes mask for word-wise comparing. * For example, a mask for 'a' is 0x6161616161616161. */ mask = mask << 16 | mask; for (i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; /* * We perform word-wise comparing with following operation: * 1. Xor the long word @src and @mask into @xor. * 2. Add @xor with @consta. * 3. ~@xor & @constb. * 4. Perform & with the result of step 2 and 3. * * Step 1 creates a byte which is 0 in the long word if * there is at least one target byte in it. * * Step 2 to Step 4 find if there is a byte with 0 in * the long word. */ asm volatile("1:\n\t" "movq (%0),%1\n\t" "xorq %6,%1\n\t" "lea (%1,%4), %2\n\t" "notq %1\n\t" "andq %5,%1\n\t" "testq %1,%2\n\t" "jne 2f\n\t" "add $8,%0\n\t" "sub $8,%3\n\t" "cmp $7,%3\n\t" "ja 1b\n\t" "2:\n\t" : "=D"(src), "=r"(xor), "=r"(data), "=r"(length) : "r"(consta), "r"(constb), "r"(mask), "0"(src), "1"(xor), "2"(data), "3"(length) : "memory", "cc"); } while (length--) { if (*src == d) return (void *)src; src++; } return NULL; } EXPORT_SYMBOL(memchr); #endif ``` ::: 準備核心模組時,輸入下列命令 ``` $ make _modinst_ MODLIB=`pwd`/rootfs/lib/modules/VER ARCH=um ``` 出現以下 ``` make: *** No rule to make target '_modinst_'. Stop. ``` 改成下列命令即可。 ``` make modules_install MODLIB=`pwd`/rootfs/lib/modules/VER ARCH=um ``` 在 Linux 核心程式碼最上層建立一個名為 `tests` 的目錄,對 [建構 User-Mode Linux 的實驗環境](https://hackmd.io/@sysprog/user-mode-linux-env#準備核心模組) 範例的 `hello.c` 稍作修改。 `hello.c` 內容: ```c #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/kernel.h> MODULE_LICENSE("GPL"); static int hello_init(void) { char str[1000]; char ch = 'x'; long time; char *ret; ktime_t kt; memset(str, '0', sizeof(char)*1000); str[998] = 'x'; str[999] = '\0'; kt = ktime_get(); ret = memchr(str, ch, strlen(str)); time = ktime_to_ns(ktime_sub(ktime_get(), kt)); printk(KERN_ALERT "Hello World! - init\n"); printk(KERN_ALERT "%s\n", ret); printk(KERN_ALERT "%ld\n", time); return 0; } static void hello_exit(void) { printk(KERN_ALERT "Hello World! - exit\n"); } module_init(hello_init); module_exit(hello_exit); ``` `Makefile` 如下 ``` obj-m += hello.o PWD := $(shell pwd) KDIR := $(PWD)/.. default: $(MAKE) -C $(KDIR) M=$(PWD) modules ARCH=um clean: $(MAKE) -C $(KDIR) M=$(PWD) clean ARCH=um ``` 編譯上述核心模組並複製到 rootfs 中 ``` $ make -C tests $ cp tests/hello.ko rootfs/ ``` 啟動 UML 並掛載 ``` UML:/ # insmod hello.ko ``` 在增加 `string_64.c` 的 UML 輸出結果如下 ``` Hello World! - init x 512 ``` 原始版本如下 ``` Hello World! - init x 768 ``` 第三行的結果為用 `ktime` 得到的時間,因為是 UML ,其值不代表實際時間,但可看出有我的 `memchr` 的確較快,也證實程式碼確實有進入核心。 :::warning 考慮以下讓程式碼更精簡的修改: ```diff @@ -12,8 +12,7 @@ void *memchr(const void *cs, int c, size_t length) { - const unsigned char *src = (const unsigned char *)cs; - unsigned char d = c; + const unsigned char *src = (const unsigned char *)cs, d = c; while (!IS_ALIGNED((long)src, sizeof(long))) { if (!length--) return NULL; @@ -33,9 +32,9 @@ void *memchr(const void *cs, int c, size * For example, a mask for 'a' is 0x6161616161616161. */ - mask = mask << 16 | mask; + mask |= mask << 16; for (i = 32; i < LBLOCKSIZE * 8; i <<= 1) - mask = (mask << i) | mask; + mask |= mask << i; /* * We perform word-wise comparing with following operation: * 1. Xor the long word @src and @mask into @xor. ``` 記得要用 [checkpath](https://docs.kernel.org/dev-tools/checkpatch.html) 檢查,搭配 `–strict` 選項 :notes: jserv ::: ## 將 `memchr()` 放入核心並重新 compile 這是在虛擬機重新編譯核心,劃分的硬碟空間建議只少 50 GB,CPU Core至少分配 4 個。 安裝必要套件 ``` $ sudo apt-get install liblz4-tool $ sudo apt-get install libc6-dev $ sudo apt-get install bison $ sudo apt-get install flex $ sudo apt-get install libelf-dev $ sudo apt-get install libncurses5-dev openssl libssl-dev $ sudo apt-get install xz-utils wget ca-certificates bc $ sudo apt install build-essential libncurses-dev $ sudo apt-get install libzstd-dev $ sudo apt-get install zstd ``` 下載 Linux Kernel 程式碼 ``` $ wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.17.9.tar.xz ``` 解壓縮並進入到此目錄 ``` $ cd linux-5.17.9 ``` 複製原本的 `.config` ``` $ cp /boot/5.13.0-41-generic .config ``` 產生 所需 `.config` ``` $ make menuconfig ``` 選擇 `load` 再選擇 `save` 接著執行下列命令或動作避免編譯過程產生錯誤 ``` $ scripts/config --disable SYSTEM_TRUSTED_KEYS $ scripts/config --disable SYSTEM_REVOCATION_KEYS ``` 並將 `.config` 中 `CONFIG_DEBUG_INFO_BTF` 改成如下: ``` CONFIG_DEBUG_INFO_BTF=n ``` 開始編譯, `-j 4` 表示動用四個 CPU Core 加速編譯,此過程耗費時間最長。 ``` $ sudo make -j 4 ``` 跑完會有以下輸出 ``` Kernel: arch/x86/boot/bzImage is ready (#2) ``` 編譯 modules ``` $ sudo make modules -j 4 ``` 安裝 module ``` $ sudo make INSTALL_MOD_STRIP=1 modules_install -j 4 ``` 安裝核心 ``` $ sudo make install $ sudo mkinitramfs /lib/modules/5.17.9/ -o /boot/initrd.img-5.17.9 ``` 更新 grub ``` $ sudo update-grub2 ``` 接著重新開機,從 UML 的安裝可知 `memchr()` 和開機有關,若核心有成功更新成 `5.17.9` 且能成功開機,則更新後的 `memchr()` 應該是沒有大問題。 輸入命令檢查核心版本 ``` $ uname -r ``` 得到以下輸出 ``` 5.17.9 ``` 核心版本成功更新。 接著一樣撰寫 kernel module 觀察 `memchr-opt-x86` 是否真的編譯入核心。 :::spoiler ```c #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/kernel.h> MODULE_LICENSE("GPL"); #define LBLOCKSIZE (sizeof(long)) void *memchr1(const void *cs, int c, size_t length) { const unsigned char *src = (const unsigned char *)cs; unsigned char d = c; while (!IS_ALIGNED((long)src, sizeof(long))) { if (!length--) return NULL; if (*src == d) return (void *)src; src++; } if (length >= LBLOCKSIZE) { unsigned long mask = d << 8 | d; unsigned int i = 32; long xor, data; const long consta = 0xFEFEFEFEFEFEFEFF, constb = 0x8080808080808080; /* * Create a 8-bytes mask for word-wise comparing. * For example, a mask for 'a' is 0x6161616161616161. */ mask = mask << 16 | mask; for (i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; /* * We perform word-wise comparing with following operation: * 1. Xor the long word @src and @mask into @xor. * 2. Add @xor with @consta. * 3. ~@xor & @constb. * 4. Perform & with the result of step 2 and 3. * * Step 1 creates a byte which is 0 in the long word if * there is at least one target byte in it. * * Step 2 to Step 4 find if there is a byte with 0 in * the long word. */ asm volatile("1:\n\t" "movq (%0),%1\n\t" "xorq %6,%1\n\t" "lea (%1,%4), %2\n\t" "notq %1\n\t" "andq %5,%1\n\t" "testq %1,%2\n\t" "jne 2f\n\t" "add $8,%0\n\t" "sub $8,%3\n\t" "cmp $7,%3\n\t" "ja 1b\n\t" "2:\n\t" : "=D"(src), "=r"(xor), "=r"(data), "=r"(length) : "r"(consta), "r"(constb), "r"(mask), "0"(src), "1"(xor), "2"(data), "3"(length) : "memory", "cc"); } while (length--) { if (*src == d) return (void *)src; src++; } return NULL; } static int hello_init(void) { char str[1000]; char ch = 'x'; long time,time1; char *ret, *ret2; ktime_t kt,kt2; memset(str, '0', sizeof(char)*1000); str[998] = 'x'; str[999] = '\0'; kt = ktime_get(); ret = memchr1(str, ch, strlen(str)); time = ktime_to_ns(ktime_sub(ktime_get(), kt)); printk(KERN_ALERT "Hello World! - init\n"); printk(KERN_ALERT "%s\n", ret); printk(KERN_ALERT "%ld\n", time); kt2 = ktime_get(); ret2 = memchr1(str, ch, strlen(str)); time1 = ktime_to_ns(ktime_sub(ktime_get(), kt2)); printk(KERN_ALERT "%s\n", ret2); printk(KERN_ALERT "%ld\n", time1); return 0; } static void hello_exit(void) { printk(KERN_ALERT "Hello World! - exit\n"); } module_init(hello_init); module_exit(hello_exit); ``` Makefile ``` obj-m += hello.o PWD := $(shell pwd) KDIR := /lib/modules/$(shell uname -r)/build default: make -C $(KDIR) M=$(PWD) modules clean: make -C $(KDIR) M=$(PWD) clean ``` ::: 執行輸入下列命令將核心模組放入核心 ``` $ sudo insmod hello.ko $ sudo rmmod hello ``` 得到以下結果 ``` [ 1003.758432] Hello World! - init [ 1003.758435] x [ 1003.758436] 458 [ 1003.758437] x [ 1003.758437] 462 [ 1005.920261] Hello World! - exit ``` 可見核心的 memchr 和 `memchr-opt-x86` 執行時間相近,可以說成功放入 Linux Kernel。 :::warning 今天發布 5.18 版,可能要再編譯一次核心並測試 ::: 經過重複上述步驟,在 5.18 Linux kernel 成功開機,利用上述的 kernel module 的結果也相似。 ## 提交 patch ### patch 內容 第一個 commit message 內容 ``` x86/lib: Optimize memchr() The original assembly version of memchr() is implemented with the byte-wise comparing technique, which does not fully use 64-bits registers in x86-64 CPU. We use word-wide comparing so that 8 characters can be compared at the same time on x86-64 CPU. First, we align the input and then use word-wise comparing to find the first 64-bit word that contain the target. Secondly, we compare every byte in the word and get the output. We create two files to measure the performance. The first file contains on average 10 characters ahead the target character. The second file contains at least 1000 characters ahead the target character. Our implementation of “memchr()” is slightly better in the first test and nearly 4x faster than the orginal implementation in the second test. Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com> Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw> ``` 第二個 commit message 內容 ``` x86/um: Use x86_64-optimized memchr Add x86_64-optimized memchr, which is 4x faster than the original implementation, into um. Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com> Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw> ``` 第一個 commit 跑命令 `./scripts/checkpatch.pl -strict` 出現以下訊息 ``` CHECK: extern prototypes should be avoided in .h files #42: FILE: arch/x86/include/asm/string_64.h:18: +extern void *memchr(const void *cs, int c, size_t length); WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? #59: new file mode 100644 ``` `WARNING` 的部分確定是要新增檔案,因此可以不理會。 `CHECK` 的部分建議在 `.h` 中避免 `extern` prototypes ,我翻查 Linux Kernel 中其他 `string.h` 相關的 `memchr()` prototype ,皆是 `extern` prototype 。 按照老師的指示增加了 `cover letter` ``` Subject: [PATCH 0/2] x86: Optimize memchr() for x86-64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit *** BLURB HERE *** These patch series add an optimized "memchr()" for x86-64 and USER-MODE LINUX (UML). There exists an assemebly implementation for x86-32. However, for x86-64, there isn't any optimized version. We implement word-wise comparison so that 8 characters can be compared at the same time on x86-64 CPU. The optimized “memchr()” is nearly 4x faster than the orginal implementation for long strings. We test the optimized “memchr()” in UML and also recompile the 5.18 Kernel with the optimized “memchr()”. They run correctly. In this patch we add a new file "string_64.c", which only contains "memchr()". We can add more optimized string functions in it in the future. Yu-Jen Chang (2): x86/lib: Optimize memchr() x86/um: Use x86_64-optimized memchr arch/x86/include/asm/string_64.h | 3 ++ arch/x86/lib/Makefile | 1 + arch/x86/lib/string_64.c | 78 ++++++++++++++++++++++++++++++++ arch/x86/um/Makefile | 2 +- 4 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 arch/x86/lib/string_64.c -- 2.25.1 ``` #### patch 收件者 第一份 ``` $ ./scripts/get_maintainer.pl -f arch/x86/include/asm/string_64.h Thomas Gleixner <tglx@linutronix.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) Ingo Molnar <mingo@redhat.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) Borislav Petkov <bp@alien8.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) Dave Hansen <dave.hansen@linux.intel.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) x86@kernel.org (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) "H. Peter Anvin" <hpa@zytor.com> (reviewer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) Kees Cook <keescook@chromium.org> (supporter:FORTIFY_SOURCE) linux-kernel@vger.kernel.org (open list:X86 ARCHITECTURE (32-BIT AND 64-BIT)) linux-hardening@vger.kernel.org (open list:FORTIFY_SOURCE) ``` 第二份 ``` $ ./scripts/get_maintainer.pl -f arch/x86/um Richard Weinberger <richard@nod.at> (maintainer:USER-MODE LINUX (UML)) Anton Ivanov <anton.ivanov@cambridgegreys.com> (maintainer:USER-MODE LINUX (UML)) Johannes Berg <johannes@sipsolutions.net> (maintainer:USER-MODE LINUX (UML)) Thomas Gleixner <tglx@linutronix.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) Ingo Molnar <mingo@redhat.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) Borislav Petkov <bp@alien8.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) Dave Hansen <dave.hansen@linux.intel.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) x86@kernel.org (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) "H. Peter Anvin" <hpa@zytor.com> (reviewer:X86 ARCHITECTURE (32-BIT AND 64-BIT)) linux-um@lists.infradead.org (open list:USER-MODE LINUX (UML)) linux-kernel@vger.kernel.org (open list:X86 ARCHITECTURE (32-BIT AND 64-BIT)) ``` 第二份的收件人應該是 Jeff Dike (5/28 更正), email 為 `jdike@linux.intel.com` 。第一份的收件人應為 Andi Kleen 。雖然他不在前述的名單上,之前用 `git blame` 查詢到他是相關檔案的維護者,他的 email 為 `ak@linux.intel.com` 。 #### email 傳送測試 安裝 git-email ``` $ sudo apt-get install git-email ``` 設定 `.gitconfig` ``` [user] name = arthurchang09 email = arthurchang09@gmail.com [sendemail] smtpEncryption = tls smtpServer = smtp.gmail.com smtpServerPort = 587 smtpUser = arthurchang09@gmail.com ``` 由於我的 gmail 有設定雙重驗證,所以要透過 git 送 email 要開啟應用程式密碼,在手機開啟 Google帳戶 -> 安全性 -> 應用程式密碼 ,在此設定密碼。 接著輸入命令 ``` git send-email --to arthurchang09@gmail.com \ --cc arthur09123456@gmail.com \ 0001-x86-lib-Optimize-memchr.patch\ 0002-um-Use-x86_64-optimized-memchr.patch ``` 要輸入密碼時,輸入前面設定的密碼,抑或是在 `.gitconfig` 設定。 傳輸完成後在我的兩個信箱皆有收到信。 ### 開發者的回覆 開發者有三點回復: * 1. `d = c` 好像是 pointer 和值的比較,可能有問題 * 2. consta (`0xFEFEFEFEFEFEFEFF`) constb (`0x8080808080808080`) 的用途是什麼? * 3. 組合語言有加法(consta),是否需要考慮 overflow 的問題,即確認 CF flag 。另外 `jne` 指令是否要改為 `jnz` ,後者似乎更合適。 經過一番檢視,我的答案如下,目前尚未回覆給開發者: * 1. 開發者可能看錯 `d` 的宣告方式,這應是 `unsigned char` 而非 `unsigned char*` ,一開始的 `unsigned char* src` 讓開發者以為 `d` 是 `pointer` ,實際上不是。(可能要指出 C 語言規格書的說明?) * 2. 可能需要詳細解說演算法,才能解釋其功用 * 3. 這其實是在做減法,即減去 `0x0101010101010101` ,只是為了減少步驟寫成加法。因此 CF flag 不需要處理。 指令寫成 `jnz` 會更有助於理解,但其作用和 `jne` 相同,皆是檢查 Zero Flag ,會做修改。 Andi Kleen 則是要求指出在 Kernel 中具體的效益,目前在研究中 另一位開發者 David Laight 也有提出幾點意見: * 1. alignment 似乎是不必要 * 2. for 迴圈是多餘的,只需保留其中的 `mask |= mask << i` * 3. src 需要綁定 `%rdi` ? * 4. 同時進行 `read-write` 的 register 要用 `+r` ,不要重複寫 * 5. You should also be able to code without needing add, sub and cmp at the end of the loop. * 6. If you use negative offsets from the end of the buffer the loop can be a single `add` and `jnz`. 我經過測試,發現應該是要將 `length` 變成 8 的倍數,因此將 `IS_ALIGN()` 巨集改成 `length & LBLOCKSIZE` 。 `if` 條件判斷中將 `for` 迴圈去掉,將 inline assembly 做一些調整並改寫了一小部分。經過 UML 測試能成功啟動。 但對於 David Laight 最後兩點還是不知如何達成。 我去掉了 alignment 的部分,並且新增一個變數 `dst` ,為執行 word-wise comparison 的上界。因此在組合語言中,不必對 `length` 做減法,只需比較 `src` 和 `dst` 之大小關係,省下一個指令,等於每跑一次迴圈,減少執行一個指令。 ```c // SPDX-License-Identifier: GPL-2.0 #include <linux/string.h> #include <linux/export.h> #include <linux/align.h> /* How many bytes are loaded each iteration of the word copy loop */ #define LBLOCKSIZE (sizeof(long)) #ifdef __HAVE_ARCH_MEMCHR void *memchr(const void *cs, int c, size_t length) { const unsigned char *src = (const unsigned char *)cs; const unsigned char *end = src + length; if (length >= LBLOCKSIZE) { unsigned long mask = c << 8 | c; long xor, data; const long consta = 0xFEFEFEFEFEFEFEFF, constb = 0x8080808080808080; const unsigned char *dst = (const unsigned char *)src + (length & 0xFFFFFFFFFFFFFFF8); /* * Create a 8-bytes mask for word-wise comparing. * For example, a mask for 'a' is 0x6161616161616161. */ mask |= mask << 16; mask |= mask << 32; /* * We perform word-wise comparing with following operation: * 1. Perform xor on the long word @src and @mask * and put into @xor. * 2. Add @xor with @consta. * 3. ~@xor & @constb. * 4. Perform & with the result of step 2 and 3. * * Step 1 creates a byte which is 0 in the long word if * there is at least one target byte in it. * * Step 2 to Step 4 find if there is a byte with 0 in * the long word. */ asm volatile("1:\n\t" "movq (%[src]),%[xor]\n\t" "xorq %[mask],%[xor]\n\t" "lea (%[xor],%[const_a]), %[tmp]\n\t" "notq %[xor]\n\t" "andq %[const_b],%[xor]\n\t" "testq %[xor],%[tmp]\n\t" "jnz 2f\n\t" "add $8,%[src]\n\t" "cmp %[src], %[dst]\n\t" "ja 1b\n\t" "2:\n\t" : [src] "+r"(src), [xor] "+r"(xor), [tmp] "+r"(data) : [const_a] "r"(consta), [const_b] "r"(constb), [mask] "r"(mask), [dst] "r"(dst) : "memory", "cc"); } while (src <= end) { if (*src == c) return (void *)src; src++; } return NULL; } EXPORT_SYMBOL(memchr); #endif ``` David Laight 將我的程式碼改寫,整題而言較為簡潔易懂,沒有組合語言。也解決 target character 為負時會出錯的問題。我認文可以當最終版本 ```c void *memchr(const void *p, int c, unsigned long length) { unsigned long mask, val; const void *end = p + length; c &= 0xff; if (p <= end - 8) { mask = c | c << 8; mask |= mask << 16; mask |= mask << 32; for (; p <= end - 8; p += 8) { val = *(unsigned long *)p ^ mask; if ((val + 0xfefefefefefefeffu) & (~val & 0x8080808080808080u)) break; } } for (; p < end; p++) if (*(unsigned char *)p == c) return p; return NULL; } ``` ### Linux Kernel UML 開機使用的 memchr 長度 在程式碼中加入 ``` printk("le %ld\n", length); ``` 發現執行 UML 時會卡住,因此改寫成 ``` if (length >= 100){ printk("le %d\n", length); } ``` 接著先執行先前編譯好的 UML 執行檔 ``` $ ./linux ``` 擷取部分輸出 ``` ... Linux version 5.18.0 (hihihi@hihihi-VirtualBox) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #37 Mon Jun 6 17:46:51 CST 2022 l 162,10 ... Memory: 26132K/50688K available (3315K kernel code, 1002K rwdata, 1112K rodata, 143K init, 166K bss, 24556K reserved, 0K cma-reserved) l 134,10 ... Console initialized on /dev/tty0 l 162,10 l 134,10 ... ``` 可看到 call `memchr()` 時,字串的長度並不長,大約只有不到 200 個字元。根據先前的測試,優化過的 `memchr()` 的效能優化並不明顯。 :::spoiler ``` [ 0.000000] le 177 [ 0.000000] Command line: BOOT_IMAGE=/boot/vmlinuz-5.18.0 root=UUID=43f42c7b-5930-4dc9-b2f4-48f8220dd500 ro quiet splash [ 0.000000] le 108 ... [ 3.620183] audit: type=1400 audit(1654591700.584:10): apparmor="STATUS" operation="profile_load" profile="unconfined" name="/usr/lib/connman/scripts/dhclient-script" pid=579 comm="apparmor_parser" [ 3.830941] le 177 [ 3.830943] le 108 [ 3.830953] le 104 [ 3.830971] le 103 [ 3.830975] le 115 [ 3.830976] le 110 [ 3.830978] le 142 [ 3.831234] le 104 [ 3.831237] le 132 [ 3.831272] le 115 [ 3.831663] le 123 [ 3.831823] le 114 [ 3.831824] le 120 [ 3.831825] le 241 [ 3.831827] le 227 [ 3.831847] le 150 [ 3.831850] le 101 [ 3.831851] le 156 [ 3.831945] le 109 [ 3.831947] le 166 [ 3.831948] le 161 [ 3.831949] le 155 [ 3.831950] le 174 [ 3.831951] le 159 [ 3.831951] le 172 [ 3.831952] le 189 [ 3.831953] le 182 [ 3.831954] le 185 [ 4.129413] e1000: enp0s3 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX ... [ 9.045064] 08:48:26.011645 main 6.1.14 r140239 started. Verbose level = 0 [ 9.045492] le 127 [ 9.045495] le 126 [ 9.046254] 08:48:26.012759 main vbglR3GuestCtrlDetectPeekGetCancelSupport: Supported (#1) [ 9.186262] systemd-journald[256]: File /var/log/journal/6196ff73bdf74ff4ae61063ebf32f0c3/user-1000.journal corrupted or uncleanly shut down, renaming and replacing. [ 9.189583] le 153 [ 10.149762] Service: Shared Clipboard [ 15.078903] le 177 [ 15.078907] le 108 [ 15.078915] le 104 [ 15.078928] le 103 [ 15.078931] le 115 [ 15.078932] le 110 [ 15.078933] le 142 [ 15.078933] le 177 [ 15.078935] le 108 [ 15.078939] le 104 [ 15.078943] le 104 [ 15.078949] le 132 [ 15.078962] le 103 [ 15.078965] le 115 [ 15.078966] le 110 [ 15.078968] le 142 [ 15.078973] le 104 [ 15.078976] le 132 [ 15.078976] le 115 [ 15.079002] le 115 [ 15.079014] le 123 [ 15.079040] le 123 [ 15.079041] le 114 [ 15.079042] le 120 [ 15.079043] le 241 [ 15.079044] le 227 [ 15.079060] le 150 [ 15.079062] le 101 [ 15.079063] le 156 [ 15.079067] le 114 [ 15.079067] le 120 [ 15.079069] le 241 [ 15.079070] le 109 [ 15.079070] le 227 [ 15.079072] le 166 [ 15.079073] le 161 [ 15.079074] le 155 [ 15.079074] le 174 [ 15.079075] le 159 [ 15.079076] le 172 [ 15.079076] le 189 [ 15.079077] le 182 [ 15.079078] le 185 [ 15.079085] le 127 [ 15.079086] le 150 [ 15.079087] le 126 [ 15.079088] le 101 [ 15.079089] le 153 [ 15.079089] le 156 [ 15.079096] le 109 [ 15.079098] le 166 [ 15.079099] le 161 [ 15.079100] le 155 [ 15.079100] le 174 [ 15.079101] le 159 [ 15.079102] le 172 [ 15.079102] le 189 [ 15.079103] le 182 [ 15.079104] le 185 [ 15.079111] le 127 [ 15.079113] le 126 [ 15.079114] le 177 [ 15.079115] le 153 [ 15.079115] le 108 [ 15.079122] le 104 [ 15.079136] le 103 [ 15.079139] le 115 [ 15.079139] le 110 [ 15.079141] le 142 [ 15.079141] le 177 [ 15.079142] le 108 [ 15.079147] le 104 [ 15.079149] le 132 [ 15.079150] le 104 [ 15.079163] le 103 [ 15.079166] le 115 [ 15.079167] le 110 [ 15.079168] le 142 [ 15.079174] le 104 [ 15.079176] le 115 [ 15.079176] le 132 [ 15.079202] le 115 [ 15.079213] le 123 [ 15.079239] le 114 [ 15.079240] le 123 [ 15.079240] le 120 [ 15.079242] le 241 [ 15.079243] le 227 [ 15.079258] le 150 [ 15.079260] le 101 [ 15.079261] le 156 [ 15.079266] le 114 [ 15.079267] le 120 [ 15.079268] le 109 [ 15.079268] le 241 [ 15.079270] le 227 [ 15.079270] le 166 [ 15.079271] le 161 [ 15.079272] le 155 [ 15.079273] le 174 [ 15.079273] le 159 [ 15.079274] le 172 [ 15.079275] le 189 [ 15.079275] le 182 [ 15.079276] le 185 [ 15.079284] le 127 [ 15.079285] le 126 [ 15.079286] le 150 [ 15.079288] le 101 [ 15.079289] le 156 [ 15.079289] le 153 [ 15.079296] le 109 [ 15.079298] le 166 [ 15.079299] le 161 [ 15.079299] le 155 [ 15.079300] le 174 [ 15.079301] le 159 [ 15.079301] le 172 [ 15.079302] le 189 [ 15.079303] le 182 [ 15.079303] le 185 [ 15.079311] le 127 [ 15.079313] le 126 [ 15.079316] le 153 [ 75.957408] le 177 [ 75.957411] le 108 [ 75.957419] le 104 [ 75.957433] le 103 [ 75.957435] le 115 [ 75.957436] le 110 [ 75.957437] le 142 [ 75.957443] le 104 [ 75.957445] le 132 [ 75.957472] le 115 [ 75.957509] le 123 [ 75.957535] le 114 [ 75.957536] le 120 [ 75.957537] le 241 [ 75.957539] le 227 [ 75.957554] le 150 [ 75.957556] le 101 [ 75.957557] le 156 [ 75.957563] le 109 [ 75.957565] le 166 [ 75.957566] le 161 [ 75.957567] le 155 [ 75.957567] le 174 [ 75.957568] le 159 [ 75.957568] le 172 [ 75.957569] le 189 [ 75.957570] le 182 [ 75.957570] le 185 [ 75.957578] le 127 [ 75.957580] le 126 [ 75.957581] le 153 [ 75.957627] le 177 [ 75.957628] le 108 [ 75.957635] le 104 [ 75.957648] le 103 [ 75.957651] le 115 [ 75.957652] le 110 [ 75.957653] le 142 [ 75.957659] le 104 [ 75.957661] le 132 [ 75.957687] le 115 [ 75.957724] le 123 [ 75.957750] le 114 [ 75.957751] le 120 [ 75.957752] le 241 [ 75.957753] le 227 [ 75.957768] le 150 [ 75.957771] le 101 [ 75.957771] le 156 [ 75.957778] le 109 [ 75.957780] le 166 [ 75.957781] le 161 [ 75.957781] le 155 [ 75.957782] le 174 [ 75.957782] le 159 [ 75.957783] le 172 [ 75.957784] le 189 [ 75.957784] le 182 [ 75.957785] le 185 [ 75.957793] le 127 [ 75.957794] le 126 [ 75.957796] le 153 [ 75.993516] le 177 [ 75.993520] le 108 [ 75.993530] le 104 [ 75.993549] le 103 [ 75.993554] le 115 [ 75.993554] le 110 [ 75.993556] le 142 [ 75.993564] le 104 [ 75.993568] le 132 [ 75.993609] le 115 [ 75.993668] le 123 [ 75.993711] le 114 [ 75.993712] le 120 [ 75.993714] le 241 [ 75.993717] le 227 [ 75.993742] le 150 [ 75.993745] le 101 [ 75.993747] le 156 [ 75.993757] le 109 [ 75.993759] le 166 [ 75.993760] le 161 [ 75.993761] le 155 [ 75.993762] le 174 [ 75.993763] le 159 [ 75.993764] le 172 [ 75.993765] le 189 [ 75.993766] le 182 [ 75.993766] le 185 [ 75.993779] le 127 [ 75.993782] le 126 [ 75.993784] le 153 [ 75.993885] le 177 [ 75.993887] le 108 [ 75.993898] le 104 [ 75.993920] le 103 [ 75.993925] le 115 [ 75.993926] le 110 [ 75.993928] le 142 [ 75.993937] le 104 [ 75.993941] le 132 [ 75.993987] le 115 [ 75.994049] le 123 [ 75.994091] le 114 [ 75.994092] le 120 [ 75.994094] le 241 [ 75.994097] le 227 [ 75.994121] le 150 [ 75.994124] le 101 [ 75.994126] le 156 [ 75.994136] le 109 [ 75.994139] le 166 [ 75.994140] le 161 [ 75.994141] le 155 [ 75.994142] le 174 [ 75.994143] le 159 [ 75.994144] le 172 [ 75.994145] le 189 [ 75.994146] le 182 [ 75.994147] le 185 [ 75.994159] le 127 [ 75.994161] le 126 [ 75.994164] le 153 ``` ::: 以上是將化的 `memchr()` 重新編u4並放入 kernel ,開機後的結果。同樣最多只有 200 多的字元。 :::info 目前 `arch/x86` 目錄中,針對 x86-64 調整過的實作是 memcpy, memset, memmove,但其他的 mem/str 系列函式仍採用 generic 實作。也許可查閱 `arch/x86/lib/string_32.c` 以確認在 Linux 核心使用的狀況。 ::: 我接著在 `/lib/string.c` 的 `memchr` 增加程式碼 ```diff void *memchr(const void *s, int c, size_t n) { + if (n >= 100) { + printk("lib %ld\n", n); + } const unsigned char *p = s; while (n-- != 0) { if ((unsigned char)c == *p++) { return (void *)(p - 1); } } return NULL; } ``` 得到的結果類似先前的測試,顯示 generic 實作似乎沒有被使用。目前看起來 `memchr()` 的使用狀況看我之前的觀察類似,透過 `#define __HAVE_ARCH_MEMCHR` 決定使用哪種 `memchr()` 。由於前述尋找 `memchr()` 皆是在虛擬機上運行。接下來會嘗試在雙系統重新編譯 kernel 並且測試使否有所改變。 嘗試繼續尋找,在 `/drivers/misc/lkdtm/heap.c` 第 207 行找到 ```c=207 if (memchr(val, 0xAB, 512) == NULL) { pr_info("Memory appears initialized (%x, no earlier values)\n", *val); } else { pr_err("FAIL: Slab was not initialized\n"); pr_expected_config_param(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, "init_on_alloc"); } ``` 這裡可以看到要搜尋 512 byte ## 更改貢獻方式,整合 `/lib/string.c` 的 `memchr()` 與 `memchr_inv` 跟老師討論後決定改成貢獻 `lib/string.c` 中的 `memchr()` ,並且和其功能相似但作用相反的涵式 `memchr_inv()` 進行 macro 上的整合。 ### 分析 `memchr_inv()` 功能為找出第一個和不標字元不相同之字元位址,也就是 `memchr()` 相反的功能。 在 `/lib/string.c` 的實作如下 ```c static void *check_bytes8(const u8 *start, u8 value, unsigned int bytes) { while (bytes) { if (*start != value) return (void *)start; start++; bytes--; } return NULL; } void *memchr_inv(const void *start, int c, size_t bytes) { u8 value = c; u64 value64; unsigned int words, prefix; if (bytes <= 16) return check_bytes8(start, value, bytes); value64 = value; #if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 value64 *= 0x0101010101010101ULL; #elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) value64 *= 0x01010101; value64 |= value64 << 32; #else value64 |= value64 << 8; value64 |= value64 << 16; value64 |= value64 << 32; #endif prefix = (unsigned long)start % 8; if (prefix) { u8 *r; prefix = 8 - prefix; r = check_bytes8(start, value, prefix); if (r) return r; start += prefix; bytes -= prefix; } words = bytes / 8; while (words) { if (*(u64 *)start != value64) return check_bytes8(start, value, 8); start += 8; words--; } return check_bytes8(start, value, bytes % 8); } ``` 這裡可看到在字串長度小於 16 字元時,直接呼叫 `check_bytes8()` 尋找第一個不符合的字元。 接著使用一連串的 `#if defined` 來建立 mask 。如果是 64-bits CPU ,其 [`BITS_PER_LONG`](https://elixir.free-electrons.com/linux/v5.18.3/source/include/asm-generic/bitsperlong.h#L9) 會是 64 ,以及有 `ARCH_HAS_FAST_MULTIPLIER` ,直接用乘法建立,像我的 CPU 屬於此類。若是 32-bits 且有 `ARCH_HAS_FAST_MULTIPLIER` ,會先建立 32-bits mask 再用 `|` 建成 64-bits mask。若沒有 `ARCH_HAS_FAST_MULTIPLIER` 則都採用 `|` 建立。 值得注意的是無論是 32-bits 或是 64-bits , `memchr_inv()` 一律採用 64-bits 的 mask 。然而在 32-bits CPU , `u64` 會需要使用兩個 register ,這樣執行效能是否和採用 `unsigned int` 效能一樣需要近一步的測試。 使用 complier explorer 並下 `-m32` 查看 32 bit 和 64 bits 的差別。在 32 bits 下都會用到較多的指令,如: ```c value64 = value; ``` 32 bits ``` movzx eax, BYTE PTR [ebp-13] mov DWORD PTR [ebp-24], eax mov DWORD PTR [ebp-20], 0 ``` 64 bits ``` movzx eax, BYTE PTR [rbp-5] mov QWORD PTR [rbp-16], rax ``` 接下來 `prefix` 變數是用來處理 alignment ,將 unalign 的字元先處理掉。 ```c while (words) { if (*(u64 *)start != value64) return check_bytes8(start, value, 8); start += 8; words--; } ``` 接著 `while` 迴圈對長度為 `8 bytes` 的字串進行比較,若有不同則呼叫 `check_bytes8()` 尋找第一個不同的字元的位址。 最後再處理長度不到 `8 bytes` 的字串。 ### 找出 `memchr` 和 `memchr_inv` 共同處並合併 兩者最明顯的共同之處為皆是要產生一個 word 長度的 mask ,因此可以寫成一個巨集,如下: ```c #if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 #define CREATE_MASK(val) \ val*= 0x0101010101010101ULL; #elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) #define CREATE_MASK(val) \ val *= 0x01010101; \ val |= val << 32; #else #define CREATE_MASK(val) \ val |= val << 8; \ val |= val << 16; \ val |= val << 32; #endif ``` 巨集名稱 `CREATE_MASK` 可能還要修改,目前先暫訂如此進行簡單測試。 由於 `memchr_inv()` 無論是 32-bit 或是 64-bit 皆是採用 `u64` 的型態進行處理,因此在 `memchr()` 也打算用同樣的方式,才能使用共同的巨集。 `memchr()` 變成如下 ```c #if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 #define CREATE_MASK(val) \ val*= 0x0101010101010101ULL; #elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) #define CREATE_MASK(val) \ val *= 0x01010101; \ val |= val << 32; #else #define CREATE_MASK(val) \ val |= val << 8; \ val |= val << 16; \ val |= val << 32; #endif void *memchr(const void *p, int c, unsigned long length) { unsigned long long mask, val; const void *end = p + length; c &= 0xff; if (p <= end - 8) { mask = c; CREATE_MASK(mask); for (; p <= end - 8; p += 8) { val = *(unsigned long *)p ^ mask; if ((val + 0xfefefefefefefeffu) & (~val & 0x8080808080808080u)) break; } } for (; p < end; p++) if (*(unsigned char *)p == c) return (void *)p; return NULL; } ``` 使用以下命令編譯成 x86-32 進行測試 ``` gcc -m32 -O2 -o a.out test.c ``` 成功通過之前使用的正確性測試 接著在 UML 測試修改過的 `/lib/string.c` 能否通過 UML 開機測試。 :::spoiler ```c #if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 #define CREATE_CHAR_MASK(val) val *= 0x0101010101010101ULL; #elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) #define CREATE_CHAR_MASK(val) \ val *= 0x01010101; \ val |= val << 32; #else #define CREATE_CHAR_MASK(val) \ val |= val << 8; \ val |= val << 16; \ val |= val << 32; #endif #ifndef __HAVE_ARCH_MEMCHR /** * memchr - Find a character in an area of memory. * @s: The memory area * @c: The byte to search for * @n: The size of the area. * * returns the address of the first occurrence of @c, or %NULL * if @c is not found */ void *memchr(const void *p, int c, unsigned long length) { u64 mask, val; const void *end = p + length; c &= 0xff; if (p <= end - 8) { mask = c; CREATE_CHAR_MASK(mask); for (; p <= end - 8; p += 8) { val = *(unsigned long *)p ^ mask; if ((val + 0xfefefefefefefeffu) & (~val & 0x8080808080808080u)) break; } } for (; p < end; p++) if (*(unsigned char *)p == c) return (void *)p; return NULL; } EXPORT_SYMBOL(memchr); #endif void *memchr_inv(const void *start, int c, size_t bytes) { u8 value = c; u64 value64; unsigned int words, prefix; if (bytes <= 16) return check_bytes8(start, value, bytes); value64 = value; CREATE_CHAR_MASK(value64); prefix = (unsigned long)start % 8; if (prefix) { u8 *r; prefix = 8 - prefix; r = check_bytes8(start, value, prefix); if (r) return r; start += prefix; bytes -= prefix; } words = bytes / 8; while (words) { if (*(u64 *)start != value64) return check_bytes8(start, value, 8); start += 8; words--; } return check_bytes8(start, value, bytes % 8); } EXPORT_SYMBOL(memchr_inv); ``` ::: ### 重新編譯 kernel 依照之前重新編譯 kernel 的方式將上方修改過的 `memchr()` 以及 `memchr_inv()` ,成功重新開機。 然而,重新編譯只編譯 64-bit 的 kernel , 32-bit kernel 並沒有測試到。由於 Ubuntu 已經沒有 32 bit 的映像檔,因此我選用 Linux Mint 32-bit 來對kernel 進行重新編譯。 首先去官網下載 32-bit 版本。使用 Virtual Box 安裝時選擇 Ubuntu 32-bit 即可。 從 `lscpu` 可看到 ``` CPU op-mode(s): 32-bit ``` 對比 64-bit Ubuntu ``` CPU op-mode(s): 32-bit, 64-bit ``` 這應該是 32-bit 的 Linux 沒錯。 接著重複之前的重新編譯 kernel 的步驟,並註解掉 `/arch/x86/include/asm/string_32.h` 中的 `memchr()` ,以免使用這塊的 `memchr()` 而非 `/lib/string.c` 中的版本。 另外像之前的測試一樣加入 `printk("str %u\n", length)` 觀察是否真的執行到 `memchr()` 以及字串的長度。 重新編譯好後, Linux Mint 成功重新開機,輸入命令 `dmesg` 也觀察到 `memch()` 的輸出,和 64 bit 版本差不多。至此可以確定 32-bit Linux 下 `memchr()` 能正確運行。 ### git blame 查看 `memchr()` 透過git blame 可發現 `mmemchr()` 是 Linus Torvalds 撰寫的,而 `memchr_inv` 是 Akinobu Mita 所撰寫。 :::spoiler ``` ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 869) #ifndef __HAVE_ARCH_MEMCHR ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 870) /** ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 871) * memchr - Find a character in an area of memory. ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 872) * @s: The memory area ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 873) * @c: The byte to search for ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 874) * @n: The size of the area. ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 875) * ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 876) * returns the address of the first occurrence of @c, or %NULL ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 877) * if @c is not found ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 878) */ ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 879) void *memchr(const void *s, int c, size_t n) ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 880) { ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 881) const unsigned char *p = s; ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 882) while (n-- != 0) { ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 883) if ((unsigned char)c == *p++) { 51a0f0f658b0e (Jesper Juhl 2005-10-30 15:02:11 -0800 884) return (void *)(p - 1); ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 885) } ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 886) } ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 887) return NULL; ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 888) } ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 889) EXPORT_SYMBOL(memchr); ^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 890) #endif 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 912) void *memchr_inv(const void *start, int c, size_t bytes) 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 913) { 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 914) u8 value = c; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 915) u64 value64; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 916) unsigned int words, prefix; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 917) 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 918) if (bytes <= 16) 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 919) return check_bytes8(start, value, bytes); 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 920) f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 921) value64 = value; 72d931046030b (Linus Torvalds 2014-09-13 11:14:53 -0700 922) #if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64 3368e8fbcda53 (Andy Shevchenko 2015-11-10 14:45:14 -0800 923) value64 *= 0x0101010101010101ULL; 72d931046030b (Linus Torvalds 2014-09-13 11:14:53 -0700 924) #elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 925) value64 *= 0x01010101; f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 926) value64 |= value64 << 32; f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 927) #else f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 928) value64 |= value64 << 8; f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 929) value64 |= value64 << 16; f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 930) value64 |= value64 << 32; f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 931) #endif 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 932) f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 933) prefix = (unsigned long)start % 8; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 934) if (prefix) { f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 935) u8 *r; f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 936) f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 937) prefix = 8 - prefix; f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 938) r = check_bytes8(start, value, prefix); 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 939) if (r) 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 940) return r; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 941) start += prefix; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 942) bytes -= prefix; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 943) } 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 944) 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 945) words = bytes / 8; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 946) 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 947) while (words) { 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 948) if (*(u64 *)start != value64) 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 949) return check_bytes8(start, value, 8); 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 950) start += 8; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 951) words--; 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 952) } 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 953) 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 954) return check_bytes8(start, value, bytes % 8); 798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 955) } ``` ::: ### 產生 patch patch 1/2: ``` Subject: [PATCH 1/2] lib/string.c: Add a macro for memchr_inv() We add a macro MEMCHR_MASK_GEN() so that both memchr_inv() and memchr() can use it to generate a 8 bytes mask. Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com> Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw> ``` patch 2/2: ``` Subject: [PATCH 2/2] lib/string.c: Optimize memchr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original version of memchr() is implemented with the byte-wise comparing technique, which does not fully use 64-bits or 32-bits registers in CPU. We use word-wide comparing so that 8 characters can be compared at the same time on CPU. This code is base on David Laight's implementation. We create two files to measure the performance. The first file contains on average 10 characters ahead the target character. The second file contains at least 1000 characters ahead the target character. Our implementation of “memchr()” is slightly better in the first test and nearly 4x faster than the orginal implementation in the second test. Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com> Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw> ``` 製作 patch 時,出現以下 CHECK ``` CHECK: Macro argument reuse 'mask' - possible side-effects? #29: FILE: lib/string.c:888: +#define MEMCHR_MASK_GEN(mask) \ + do { \ + mask *= 0x01010101; \ + mask |= mask << 32; \ + } while (0) CHECK: Macro argument 'mask' may be better as '(mask)' to avoid precedence issues #29: FILE: lib/string.c:888: +#define MEMCHR_MASK_GEN(mask) \ + do { \ + mask *= 0x01010101; \ + mask |= mask << 32; \ + } while (0) ``` 按照指示修該產生以下訊息 ``` CHECK: Macro argument reuse 'mask' - possible side-effects? #83: FILE: lib/string.c:888: +#define MEMCHR_MASK_GEN(mask) \ + do { \ + (mask) *= 0x01010101; \ + (mask) |= mask << 32; \ + } while (0) ``` 參考 [Macro Pitfalls](https://stuff.mit.edu/afs/athena/project/rhel-doc/3/rhel-cpp-en-3/macro-pitfalls.html) 第一種情況為優先權問題,假設有一 macro 如下: ```c #define ceil_div(x, y) (x + y - 1) / y ``` 用以下方法呼叫 ```c a = ceil_div (b & c, sizeof (int)); ``` 展開如下 ```c a = (b & c + sizeof (int) - 1) / sizeof (int); ``` 然而,考慮到優先程度,實際上會用以下方式運行 ```c a = (b & (c + sizeof (int) - 1)) / sizeof (int); ``` 而非 ```c a = ((b & c) + sizeof (int) - 1)) / sizeof (int); ``` 第二種情況是擔心 `mask` 可能是 fuction 的回傳值,在 macro 中重複使用會造成多次呼叫 function 。 然而,`MEMCHR_MASK_GEN` 這個 macro 只接受變數輸入,是將 `mask` 中的值展開成 8 bytes ,不可能牽涉到優先權以及 function 呼叫,因此我認為不須理會以上的 CHECK 。 ### 提交 patch 對象 輸入下列命令後 ``` ./scripts/get_maintainer.pl 0001-lib-string.c-Add-a-macro-for-memchr_inv.patch Andy Shevchenko <andy@kernel.org> (reviewer:GENERIC STRING LIBRARY) linux-kernel@vger.kernel.org (open list) ``` 因此寄給的對象可能是 `memchr_inv()` 的作者 Akinobu Mita `akinobu.mita@gmail.com` 以及 Andy Shevchenko `andy@kernel.org` ### 之後的修改 根據老師的提示,有些處理器架構,如:ARM ,並沒有對 unaligned data 進行處理時會有效率問題,因此要針對這一點做處理,也是開發者們在乎的問題。 ```c #define LBLOCKSIZE (sizeof(long)) #if BITS_PER_LONG == 64 #define DETECT_CHAR(X) \ (((X)-0x0101010101010101ULL) & ~(X)&0x8080808080808080ULL) #define MEMCHR_MASK_GEN(mask) \ do { \ (mask) |= (mask) << 8; \ (mask) |= (mask) << 16; \ (mask) |= (mask) << 32; \ } while (0) #else #define DETECT_CHAR(X) (((X)-0x01010101) & ~(X)&0x80808080) #define MEMCHR_MASK_GEN(mask) \ do { \ (mask) |= (mask) << 8; \ (mask) |= (mask) << 16; \ } while (0) #endif void *memchr(const void *p, int c, size_t length) { unsigned long mask, val; const void *end = p + length; c &= 0xff; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS while ((long)p & (sizeof(long) - 1)) { if (p >= end) return NULL; if (*(unsigned char *)p == c) return (void *)p; p++; } #endif if (p <= end - LBLOCKSIZE) { mask = c; MEMCHR_MASK_GEN(mask); for (; p <= end - LBLOCKSIZE; p += LBLOCKSIZE) { val = *(unsigned long *)p ^ mask; if (DETECT_CHAR(val)) break; } } for (; p < end; p++) if (*(unsigned char *)p == c) return (void *)p; return NULL; } EXPORT_SYMBOL(memchr); ``` ```c #if BITS_PER_LONG == 64 #define MEMCHR_MASK_GEN(mask) \ do { \ (mask) |= (mask) << 8; \ (mask) |= (mask) << 16; \ (mask) |= (mask) << 32; \ } while (0) #else #define MEMCHR_MASK_GEN(mask) \ do { \ (mask) |= (mask) << 8; \ (mask) |= (mask) << 16; \ } while (0) #endif void *memchr(const void *p, int c, size_t length) { unsigned long mask; unsigned char *src = (unsigned char *) p; const void *end = p + length; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; size_t max = length; c &= 0xff; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* * If src is unaligned, don't cross a page boundary, * since we don't know if the next page is mapped. */ if ((long)p & (sizeof(long) - 1)) { size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1)); if (limit < max) max = limit; } #else /* If src is unaligned, don't do word-at-a-time. */ if (((long) p) & (sizeof(long) - 1)) max = 0; #endif mask = c; MEMCHR_MASK_GEN(mask); for ( ; max >= sizeof(unsigned long); max -= sizeof(unsigned long), src += sizeof(unsigned long)) { unsigned long data, result; data = read_word_at_a_time(src); data = data ^ mask; if (has_zero(data, &result, &constants)) { break; } } for (; src < end; src++) if (*src == c) return (void *)src; return NULL; } ```