contribute by < arthurchang09
>
下方組合語言可以順利完成 alignment
的部分,相當於 memchr_opt
第一個 while
迴圈,
void *func(const void *src_void, int c, size_t length){
const unsigned char *src = (const unsigned char *) src_void;
unsigned char d = c;
long al;
char test;
asm volatile(
"jmp align\n\t"
"loop:\n\t"
"length:\n\t"
"decq %1\n\t"
"testq %1,%1\n\t"
"jne equal\n\t"
"mov $0,%0\n\t"
"jmp end\n\t"
"equal:\n\t"
"mov (%0),%3\n\t"
"cmp %%al,%3\n\t"
"je end\n\t"
"incq %0\n\t"
"align:\n\t"
"mov %0,%2\n\t"
"andq $7,%2\n\t"
"testq %2,%2\n\t"
"jne loop\n\t"
: "=D" (src), "=&c" (length),"=r"(al),"=r"(test)
: "a" (c), "0" (src), "1" (length), "2"(al),"3"(test)
: "memory","cc");
asm volatile(
"end:\n\t"
: "=D" (src), "=&c" (length),"=r"(al),"=r"(test)
: "a" (c), "0" (src), "1" (length), "2"(al),"3"(test)
: "memory","cc");
return (void *) src;
目前已將 alignment 與 DETEC_CHAR
巨集的部分改寫成以下 inline assembly 。
void *fun(const void *src_void, int c, size_t length){
const unsigned char *src = (const unsigned char *) src_void;
unsigned char d = c;
long al;
char test;
asm volatile(
"jmp align\n\t"
"loop:\n\t"
"length:\n\t"
"decq %1\n\t"
"testq %1,%1\n\t"
"jne equal\n\t"
"mov $0,%0\n\t"
"jmp end\n\t"
"equal:"
"mov (%0),%3\n\t"
"cmp %%al,%3\n\t"
"je end\n\t"
"incq %0\n\t"
"align:\n\t"
"mov %0,%2\n\t"
"testq $7,%2\n\t"
"jne loop\n\t"
: "=D" (src), "=&c" (length),"=r"(al),"=r"(test)
: "a" (c), "0" (src), "1" (length), "2"(al),"3"(test)
: "memory","cc");
if (!TOO_SMALL(length)) {
unsigned long *asrc = (unsigned long *) src;
unsigned long mask = d << 8 | d;
mask = mask << 16 | mask;
for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
mask = (mask << i) | mask;
unsigned long xor, consta = 0xFEFEFEFEFEFEFEFF,constb = 0x8080808080808080;
unsigned long mask4 = mask;
asm volatile(
"detect_loop:\n\t"
"movq %6,%1\n\t"
"xorq (%0),%1\n\t"
"movq %1,%2\n\t"
"addq %4,%1\n\t"
"notq %2\n\t"
"andq %5,%2\n\t"
"andq %1,%2\n\t"
"testq %2,%2\n\t"
"jne break_detect\n\t"
"sub $8,%3\n\t"
"add $8,%0\n\t"
"cmp $7,%3\n\t"
"jae detect_loop\n\t"
:"=D"(asrc),"=r"(mask),"=r"(xor),"=r"(length)
:"r"(consta),"r"(constb),"r"(mask4),"0"(asrc),"1"(mask),"2"(xor),"3"(length)
:"memory");
asm volatile(
"break_detect:\n\t"
:"=D"(asrc),"=r"(mask),"=r"(xor),"=r"(length)
:"r"(consta),"r"(constb),"r"(mask4),"0"(asrc),"1"(mask),"2"(xor),"3"(length)
:"memory");
src = (unsigned char *) asrc;
}
asm volatile("repne\n\t"
"scasb\n\t"
"je 1f\n\t"
"mov $1,%0\n"
"1:\tdec %0"
: "=D" (src), "=&c" (length)
: "a" (c), "0" (src), "1" (length)
: "memory");
asm volatile(
"end:\n\t"
: "=D" (src), "=&c" (length),"=r"(al),"=r"(test)
: "a" (c), "0" (src), "1" (length), "2"(al),"3"(test)
: "memory","cc");
return (void *) src;
}
效能與正確性測試程式如下
int main(){
char str2[1000];
memset(str2,'0', sizeof(char)*1000);
str[999] = '\0';
char ch3 = '4';
char *ret, *ret1, *ret3;
for (int i = 0; i < 999; ++i) {
memset(str2,'0', sizeof(char)*1000);
str2[i] = '4';
struct timespec t1,t2,t3,t4,t5,t6;
ret = memchr_opt(str2,ch3,strlen(str2));
clock_gettime(CLOCK_REALTIME, &t1);
for (int j = 0; j < 100; ++j){
ret = memchr_opt(str2,ch3,strlen(str2));
}
clock_gettime(CLOCK_REALTIME, &t2);
clock_gettime(CLOCK_REALTIME, &t3);
for (int j = 0; j < 100; ++j){
ret1 = fun(str2,ch3,strlen(str2));
}
clock_gettime(CLOCK_REALTIME, &t4);
clock_gettime(CLOCK_REALTIME, &t5);
for (int j = 0; j < 100; ++j){
ret3 = memchr(str2,ch3,strlen(str2));
}
clock_gettime(CLOCK_REALTIME, &t6);
printf("%d %ld %ld %ld\n", i, (t2.tv_nsec - t1.tv_nsec) / 100, (t4.tv_nsec - t3.tv_nsec) / 100, (t6.tv_nsec - t5.tv_nsec) / 100);
if (strlen(ret) != strlen(ret1)){
printf("error\n");
} else {
for (int k = 0; k < strlen(ret); ++k){
if (ret[k] != ret1[k]){
printf("error2");
break;
}
}
}
}
return 0;
}
編譯
gcc -o a.out test.c
初步在虛擬機 (CPU i7-10700K) userspace 之效能如下圖
雖然效能不及 default-memchr
,但已比 memchr-opt
運行速度更快。
預計採用 fibdrv
方式寫成 Linux character device driver,使用 copy_from_user
將字串從 userspace
複製到 kernel 放入 kmalloc
要到的記憶體空間,最後將結果使用 copy_to_user
回傳到 userspace
,並將執行時間用 return 的方式回傳。
將 alignment 的部分稍作修改,省一次 jmp
。
asm volatile(
"testq $7,%0\n\t"
"je end_align\n\t"
"loop:\n\t"
"length:\n\t"
"decq %1\n\t"
"testq %1,%1\n\t"
"jne equal\n\t"
"mov $0,%0\n\t"
"jmp end\n\t"
"equal:"
"mov (%0),%2\n\t"
"cmp %%al,%2\n\t"
"je end\n\t"
"incq %0\n\t"
"align:\n\t"
"testq $7,%0\n\t"
"jne loop\n\t"
"end_align:\n\t"
: "=D" (src), "=&c" (length),"=r"(test)
: "a" (c), "0" (src), "1" (length),"2"(test)
: "memory","cc");
考慮到之後會在 LKML 提交圖表,標示方式應該統一:
glibc-memchr-x86
: glibc 裡頭 sysdep/x86 的 memchr
實作linux-memchr-x86
: 原本 Linux 核心的 x86 memchr
實作memchr-x86-opt
: 上述針對 Linux 核心 x86 memchr
的效能改進實作回顧 inline assmebly 程式碼,用 gcc -S test.c
觀察組合語言,發現每一段 inline assmebly 前都會將原本 register 的植存回記憶體,我的實作又有數個分段的 inline assembly ,因此最佳的方式只要一塊 inline assembly 。
.L23:
cmpl $63, -60(%rbp)
jbe .L24
movabsq $-72340172838076673, %rax
movq %rax, -32(%rbp)
movabsq $-9187201950435737472, %rax
movq %rax, -24(%rbp)
movq -32(%rbp), %r8
movq -24(%rbp), %r9
movq -48(%rbp), %r10
movq -40(%rbp), %rsi
movq -16(%rbp), %rcx
movq -8(%rbp), %rdx
movq -88(%rbp), %rax
movq %rsi, %rdi
#APP
# 180 "test.c" 1
detect_loop:
movq (%rdi),%rcx
xorq %r10,%rcx
lea (%rcx,%r8), %rdx
notq %rcx
andq %r9,%rcx
testq %rcx,%rdx
jne break_detect
sub $8,%rax
add $8,%rdi
cmp $7,%rax
jae detect_loop
經過數次對比測試,發現去掉下列部分組合語言:
asm volatile("repne\n\t"
"scasb\n\t"
"je 1f\n\t"
"mov $1,%0\n"
"1:\tdec %0"
: "=D" (src), "=&c" (length)
: "a" (c), "0" (src), "1" (length)
: "memory");
去掉開頭 alignment 的組合語言在不開啟編譯器優化下效能沒差。但先去掉以節省將 register 存回 memory 的操作。
在虛擬機中 Linux 核心之效能表現不錯。和 glibc-memchr-x86
實作比較上,效能也有些許勝出,如下圖所示:
以上測試皆在虛擬機上,轉移到實體電腦測試(CPU i5-7200U)。
參照 std::find() and memchr() Optimizations 一文,使用其效能分析的手法,對上述 memchr-x86-opt
和相關實作進行評比。
接著參考 std::find() and memchr() Optimizations , 其採用 128 KiB
的 char
buffer 儲存字串。會選擇這個大小是因為會有更好的 throughput ,參見 GNU 註解 ,我先在虛擬機是跑裡面的 script
for i in $(seq 0 10); do
bs=$((1024*2**$i))
printf "%7s=" $bs
timeout --foreground -sINT 2 \
dd bs=$bs if=/dev/zero of=/dev/null 2>&1 \
| sed -n 's/.* \([0-9.]* [GM]B\/s\)/\1/p'
done
得到以下輸出:
bash throughput.sh
1024=3.6 GB/s
2048=6.5 GB/s
4096=11.0 GB/s
8192=16.2 GB/s
16384=21.4 GB/s
32768=22.1 GB/s
65536=26.1 GB/s
131072=28.0 GB/s
262144=27.3 GB/s
524288=27.1 GB/s
1048576=28.3 GB/s
筆電雙系統
1024=916 MB/s
2048=1.8 GB/s
4096=3.3 GB/s
8192=5.7 GB/s
16384=9.0 GB/s
32768=11.5 GB/s
65536=15.0 GB/s
131072=17.0 GB/s
262144=17.4 GB/s
524288=17.8 GB/s
1048576=18.0 GB/s
每次執行執會稍有不同,但可看出大約在
此文作者程式碼主要是測試較為破碎的字串,輸入資料擷取部分如下:
W,w,WW,WWW,WY|Y,y,A
a,AA,AAA,aah,ah|AI,air,AR
Ar,Au,aw,E,e|ea,ear,EEO
e'er,eh,EOE,ER,Er|er,EU,Eu
Eur,I,i,IA,Ia|IE,IEEE,ii
iii,Io,IOU,Ir,O|o,oar,OE
o'er,OH,oh,oi,ooh|OR,or,our
ow,U,u,UAR,UAW|ugh,uh,Ur
測試程式會用一次 find()
,和 memchr
功能相近的程式,去找每一行的 \n
的位址,再用一次 find()
找到 |
。以上方的資料為例,第一次尋找會找到 W,w,WW,WWW,WY|Y,y,A
,第二次會找到 |Y,y,A
。接著移向下一行再尋找,直到 buffer ,即暫存所有資料的 array 走到盡頭,接著會再繼續從檔案讀取下一波 131072
byte 資料。因此,估計在不開優化的情形下 linux-memchr-x86
的表現不會太差。
以下為改寫成 C 之測試程式碼
int main()
{
char buffer[128*1024 + 64];
int N = 128 * 1024;
int n = 3000;
FILE* fd = fopen("in", "r");
size_t r, off = 0;
char *e, *x = buffer, *y;
const char *b1 = buffer + N;
for (int i = 0; i < n; ++i) {
fseek(fd, 0, SEEK_SET);
while (1)
{
off = 0;
r = fread(buffer, sizeof(char), N, fd);
char *b = buffer;
for (;;) {
x = memchr_x86(b, '\n', b1 - x);
if (!x) {
break;
}
y = memchr_x86(b, '|', x - b);
*x = '\0';
off += y - b;
if (y != x) {
printf("%ld\n", off);
}
if (x >= b1) {
break;
}
b = x + 1;
off = 0;
}
if (r != N) {
break;
}
}
}
}
採用 command line 的 time
測量整個程式的執行時間,不開啟編譯器優化
time taskset 0x01 ./b.out
memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |
---|---|---|---|
real time | 11.33825 | 11.72425 | 13.076 |
user time | 11.0325 | 11.406 | 12.726 |
sys time | 0.2935 | 0.32425 | 0.344 |
由上方表格可看出在短字串下 linux-memchr-x86
表現比 glibc-memchr-x86
好
接下來測試開啟編譯器優化,由於先前 inline assembly 的 label 是採用文字,造成開啟 -O3
優化時會出現錯誤,因此在此修改 label ,改用數字表示,且在 jump
instruction 後的 label 要加上 f
或 b
表示會往下方或上方的 label 進行 jump ,使編譯器 unroll 時不會發生錯誤,如下:
void *fun1(const void *src_void, int c, size_t length){
const unsigned char *src = (const unsigned char *) src_void;
unsigned char d = c;
long al;
char test;
while (UNALIGNED(src)) {
if (!length--)
return NULL;
if (*src == d)
return (void *) src;
src++;
}
if (!TOO_SMALL(length)) {
unsigned long *asrc = (unsigned long *) src;
unsigned long mask = d << 8 | d;
mask = mask << 16 | mask;
for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
mask = (mask << i) | mask;
unsigned long xor, consta = 0xFEFEFEFEFEFEFEFF,constb = 0x8080808080808080;
unsigned long data;
asm volatile(
"1:\n\t"
"movq (%0),%1\n\t"
"xorq %6,%1\n\t"
"lea (%1,%4), %2\n\t"
"notq %1\n\t"
"andq %5,%1\n\t"
"testq %1,%2\n\t"
"jne 2f\n\t"
"sub $8,%3\n\t"
"add $8,%0\n\t"
"cmp $7,%3\n\t"
"jae 1b\n\t"
"2:\n\t"
:"=D"(asrc),"=r"(data),"=r"(xor),"=r"(length)
:"r"(consta),"r"(constb),"r"(mask),"0"(asrc),"1"(data),"2"(xor),"3"(length)
:"memory");
src = (unsigned char *) asrc;
}
while (length--) {
if (*src == d){
return (void *) src;
}
src++;
}
return (void *) NULL;
}
memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |
---|---|---|---|
real time | 8.34025 | 11.1175 | 8.4965 |
user time | 8.01525 | 10.82 | 8.23925 |
sys time | 0.312 | 0.291 | 0.257 |
開啟 -O3
後, 三者的速度都有所加快。其中 glibc-memchr-x86
之速度加快很多,但仍較 memchr-x86-opt
慢。
接著回去看組合語言,兩者皆是將函式直接放入 int main()
中,因此 funtion call 的成本兩者應該是相近的,其中並未使用 loop unrolling 的方式優化。因此推測是 inline assmebly 的部分帶來的加速效果。因此我使用原本測驗一的程式碼再跑一次測試,並使用 perf
比較是否是組合語言達成這個效果,如下:
memchr-x86-opt(assembly)
sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null
Performance counter stats for 'taskset 0x01 ./b.out' (10 runs):
205,4433,7709 cycles ( +- 0.02% ) (66.63%)
447,8938,8415 instructions # 2.18 insn per cycle ( +- 0.00% ) (83.32%)
282,0384 cache-misses # 1.288 % of all cache refs ( +- 7.92% ) (83.33%)
2,1897,1894 cache-references ( +- 1.35% ) (83.35%)
2,6449,1465 branch-misses ( +- 0.01% ) (83.36%)
96,5328,5976 branch ( +- 0.00% ) (83.33%)
8.25079 +- 0.00243 seconds time elapsed ( +- 0.03% )
memchr-x86-opt(original)
sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null
Performance counter stats for 'taskset 0x01 ./b.out' (10 runs):
218,3588,7203 cycles ( +- 0.05% ) (66.66%)
448,9144,0365 instructions # 2.06 insn per cycle ( +- 0.00% ) (83.34%)
342,8490 cache-misses # 1.737 % of all cache refs ( +- 8.35% ) (83.34%)
1,9738,5499 cache-references ( +- 0.89% ) (83.34%)
2,5664,0313 branch-misses ( +- 0.02% ) (83.34%)
95,5833,8661 branch ( +- 0.00% ) (83.32%)
8.77154 +- 0.00486 seconds time elapsed ( +- 0.06% )
glibc-memchr-x86
sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null
Performance counter stats for 'taskset 0x01 ./b.out' (10 runs):
211,8621,2328 cycles ( +- 0.03% ) (66.64%)
444,6003,3863 instructions # 2.10 insn per cycle ( +- 0.00% ) (83.33%)
335,5993 cache-misses # 1.595 % of all cache refs ( +- 5.27% ) (83.34%)
2,1047,1090 cache-references ( +- 1.85% ) (83.34%)
2,5254,3590 branch-misses ( +- 0.00% ) (83.34%)
97,1084,9600 branch ( +- 0.00% ) (83.34%)
8.50679 +- 0.00295 seconds time elapsed ( +- 0.03% )
linux-memchr-x86
sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null
Performance counter stats for 'taskset 0x01 ./b.out' (10 runs):
280,0662,7457 cycles ( +- 0.09% ) (66.65%)
309,2942,6350 instructions # 1.10 insn per cycle ( +- 0.01% ) (83.33%)
1548,9334 cache-misses # 7.536 % of all cache refs ( +- 3.03% ) (83.33%)
2,0553,4812 cache-references ( +- 1.52% ) (83.34%)
72,7369 branch-misses ( +- 0.67% ) (83.34%)
61,5698,8166 branch ( +- 0.01% ) (83.34%)
11.2574 +- 0.0114 seconds time elapsed ( +- 0.10% )
整理如下:
memchr-x86-opt(assembly) | memchr-x86-opt(original) | glibc-memchr-x86 | linux-memchr-x86 | |
---|---|---|---|---|
time(s) | 8.25079 ± 0.00243 | 8.77154 ± 0.00486 | 8.50679 ± 0.00295 | 11.2574 ± 0.0114 |
instructions | 447,8938,8415 | 448,9144,0365 | 444,6003,3863 | 309,2942,6350 |
cache-misses | 282,0384 | 342,8490 | 335,5993 | 1548,9334 |
insn per cycle | 2.18 | 2.06 | 2.10 | 1.10 |
相較於測驗一原始程式,可以看出我修改的 inline assembly 在 IPC(instruction per cycle) 以及 cache-misses 有所領先。
以上的測試主要是針對較短字串,若目標字元在整個字串相對靠後的位置,表現應該會有所變化。接下來我對輸入資料做修改,在 |
之前至少有 200 個單字,後面有 1 ~ 50 個單字,每個單字平均長度以 5 個字元計算,包括逗號,至少需比較 1199 個字元。
資料產生的方式為以下 word_gen.py
,從 /usr/share/dict/words
中隨機選取單詞組合
#!/usr/bin/env python3
import random
import string
words = open('/usr/share/dict/words').read().splitlines()
ranstr = ''
f = open('in2.txt', 'w')
for i in range(15833):
ranstr1 = ''
lengh = random.randint(200,400)
for j in range(lengh):
ranstr += words[random.randint(0,len(words) - 1)]
if (j < lengh - 1):
ranstr += ','
ranstr += '|'
lengh = random.randint(1,50)
for j in range(lengh):
ranstr += words[random.randint(0,len(words) - 1)]
if (j < lengh - 1):
ranstr += ','
print(ranstr, file = f)
ranstr = ''
f.close()
範例字串
dawns,gratitude's,Houdini,barnyard,OHSA's,Florence's,rehabilitates,foulest,conquistadores,divide's,insert's,ramifications
開啟 -O3
優化 結果
memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |
---|---|---|---|
real time | 1m 5.57875 | 4m 20.38075 | 1m 16.8215 |
user time | 45.76925 | 3m 57.2797 | 55.55025 |
sys time | 19.73975 | 22.98525 | 21.2315 |
未開啟優化 :
memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |
---|---|---|---|
real time | 1m 30.063 | 4m 20.9525 | 2m 28.592 |
user time | 1m 8.37275 | 3m 57.7035 | 2m 6.033 |
sys time | 21.73575 | 23.12075 | 22.484 |
由以上表格可看出在不開優化下 linux-memchr-x86
在長字串的搜尋就沒有比 glibc-memchr-x86
快。
由於 time
這個命令可能會將 ELF loader 和 VMA 的成本都算進去,因此改用 struct timespec
來測量時間這 3000 次 iteration 總共花的時間。
結果如下:
開啟 -O3
memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |
---|---|---|---|
時間(s) | 65.403 | 259.202 | 65.284 |
未開啟優化
memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |
---|---|---|---|
時間(s) | 89.296 | 259.409 | 147.488 |
意外的是開啟 -O3
時 glibc-memchr-x86
這次的測試成績居然較 memchr-x86-opt
佳,大約可以快約 0.1 ~ 0.8 秒,因此我用同樣的方式重新測量了一次時間,結果 glibc-memchr-x86
時間又回到之前採用 time
命令的長度,反而 memchr-x86-opt
的執行時間沒有太大改變,維持 65 秒上下,不太清楚為何有這 10 秒的差距。
memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |
---|---|---|---|
時間(s) | 65.398 | 259.214 | 76.24 |
接著我在程式碼不同的位置放上 clock_gettime
測量不同區段的執行時間。首先,放在上方程式碼 17 行 for
迴圈前後,將每次跑迴圈的時間累加起來 。接著是在 while
迴圈前後,將每次檔案處理完的時間加起來,比較這其中的差距。
逐次 for
迴圈時間總和
memchr-x86-opt | glibc-memchr-x86 | |
---|---|---|
時間(s) | 45.5391 | 44.93 |
逐次 while
迴圈時間總和
memchr-x86-opt | glibc-memchr-x86 | |
---|---|---|
時間(s) | 69.74045 | 76.209 |
明顯觀察到 for
迴圈的執行時間 glibc-memchr-x86
較快,而 while
迴圈中則變慢,且 while
迴圈的測量數據中, memchr-x86-opt
竟然較測量 3000 次 iteration 長,明顯測量上有問題。
接著我測量跑完單一檔案所花的時間,也就是 while
迴圈前後的位置放上 clock_gettime
,但只執行一次就跳出最外層的 for
迴圈,結果如下:
memchr-x86-opt | glibc-memchr-x86 | |
---|---|---|
時間(s) | 0.02173 | 0.02484 |
上方表格的數字乘 3000 就和使用 time
以及測量 3000 iteration 的時間相近,
跟老師討論後,認為時間的差距有可能是因為 CPU0 存在一些特定的裝置驅動或 kthread,造成測量的時間有誤,因此重新限定 CPU 給特定的程式使用。我的 CPU 為 2 核 4 緒,因此將 CPU2 和 CPU 3 獨立出來。
進入 /etc/default/grub
sudo vim /etc/default/grub
看到以下這行
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
改成
GRUB_CMDLINE_LINUX_DEFAULT="quiet splash isolcpus=2,3"
在筆電的 BIOS 翻了一圈並沒有相關選項,因此無法關閉。推測可能是因為筆電的 BIOS 限制較嚴格,加上我的筆電 CPU 其實只有兩個實體核,故無法關閉。
為了去除可能的誤差,我按照老師的提示,運用統計手法去除極端值,並開啟 -O2
優化。
首先,測量一次檔案的時間,總共取 1000 次,並且取 95% 信賴區間。
memchr-x86-opt | linux-memchr-x86 | glibc-memchr-x86 | |
---|---|---|---|
時間範圍(s) | 0.02190 ~ 0.02201 | 0.08536 ~ 0.08713 | 0.02256 ~0.02275 |
可看出在 -O2
優化下, memchr-x86-opt
和 glibc-memchr-x86
效能是相差無幾,兩者的核心概念是相同,只是實作方式稍有不同。我所新增的組合語言也是基於 -O3
的組合語言改寫,因此效能差不多是在意料之中。
相較於 linux-memchr-x86
, memchr-x86-opt
的執行時間是 前者的
UML 之核心版本為 5.15,memchr
反組譯的結果如下:
$ objdump -xd linux | grep -A12 "<memchr>"
000000006020a91e <memchr>:
6020a91e: f3 0f 1e fa endbr64
6020a922: 48 89 f8 mov %rdi,%rax
6020a925: 48 01 fa add %rdi,%rdx
6020a928: 48 39 d0 cmp %rdx,%rax
6020a92b: 74 0e je 6020a93b <memchr+0x1d>
6020a92d: 48 8d 48 01 lea 0x1(%rax),%rcx
6020a931: 40 38 30 cmp %sil,(%rax)
6020a934: 74 07 je 6020a93d <memchr+0x1f>
6020a936: 48 89 c8 mov %rcx,%rax
6020a939: eb ed jmp 6020a928 <memchr+0xa>
6020a93b: 31 c0 xor %eax,%eax
6020a93d: c3 retq
簡單觀察組合語言, lea
將資料讀入 register , cmp
將讀入的資料和字元做比較,其中 %sil
是一個 8-bits register ,若相同,則 Zero Flag 設起,會跳到 req
回傳值。因此這部分的組合語言為 byte-wise comparing 。
以下組合語言輸出命令如下
gcc -S asm.c
gcc -S -O2 asm.c
並使用 diff
命令比較組合語言差異
if (!TOO_SMALL(length))
中的部分,多了 movq -56(%rbp), %rax
movq %rax, -40(%rbp)
也就是 unsigned long *asrc = (unsigned long *) src;
這段程式碼的組合語言。
但開啟 -O2
優化後,這部分被優化掉,有沒有 上述這段程式碼不影響組合語言。
-O2
優化時也沒有差異。我直接 git clone
Linux 在 Github 上的專案。
git blame arch/x86/lib/string_32.c
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 180) #ifdef __HAVE_ARCH_MEMCHR
8cf36d2bc5832 arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-02-19 23:09:59 +0100 181) void *memchr(const void *cs, int c, size_t count)
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 182) {
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 183) int d0;
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 184) void *res;
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 185) if (!count)
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 186) return NULL;
8cf36d2bc5832 arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-02-19 23:09:59 +0100 187) asm volatile("repne\n\t"
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 188) "scasb\n\t"
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 189) "je 1f\n\t"
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 190) "movl $1,%0\n"
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 191) "1:\tdecl %0"
3492cdf0176bd arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-08-02 21:25:13 +0200 192) : "=D" (res), "=&c" (d0)
3492cdf0176bd arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-08-02 21:25:13 +0200 193) : "a" (c), "0" (cs), "1" (count)
3492cdf0176bd arch/x86/lib/string_32.c (Paolo Ciarrocchi 2008-08-02 21:25:13 +0200 194) : "memory");
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 195) return res;
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 196) }
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 197) EXPORT_SYMBOL(memchr);
b520b85a963bf arch/i386/lib/string.c (Andi Kleen 2007-07-21 17:09:59 +0200 198) #endif
memchr 基本上由 Andi Kleen 和 Paolo Ciarrocchi 變更 memchr
這段程式碼,主要內容還是由 Andi Kleen 撰寫。
memchr
的改進描述,可參見 arm64: Better optimised memchr()
Although we implement our own assembly version of
memchr()
, it turns out to be barely any better than what GCC can generate for the generic C version (and would go wrong if thesize_t
argument were ever large enough to be interpreted as negative). Unfortunately we can't import the tuned implementation from the Arm optimized-routines library, since that has some Advanced SIMD parts which are not really viable for general kernel library code. What we can do, however, is pep things up with some relatively straightforward word-at-a-time logic for larger calls.Adding some timing to optimized-routines'
memchr()
test for a simple benchmark, overall this version comes in around half as fast as the SIMD code, but still nearly 4x faster than our existing implementation.
commit 9e51cafd783b22018fb15bfb06d65f69349223a9
思考中…
The original assembly version of memchr()
is implemented with the byte-wise comparing technic, which is not fully using whole 64-bits registers in x86-64 CPU. We use word-wide comparing so that 8 characters can be compared at the same time on x86-64 CPU. First we align the input and then use word-wise comparing to find the first 64-bit word that content the target. Secondly, we compare every byte in the word and get the output.
We create two files to measure the performance. The first file contains on average 10 characters ahead the target character. The second file contains at least 1000 characters ahead the target character. It turns out that "memchr()" is slightly better in the first test and nearly 4x faster than the orginal implementation in the second test.
memchr-opt-x86
可能的添加方式以下節錄 /include/linux/string.h
以及 /lib/string.c
#ifndef __HAVE_ARCH_MEMCHR
extern void * memchr(const void *,int,__kernel_size_t);
#endif
#ifndef __HAVE_ARCH_MEMCHR
/**
* memchr - Find a character in an area of memory.
* @s: The memory area
* @c: The byte to search for
* @n: The size of the area.
*
* returns the address of the first occurrence of @c, or %NULL
* if @c is not found
*/
void *memchr(const void *s, int c, size_t n)
{
const unsigned char *p = s;
while (n-- != 0) {
if ((unsigned char)c == *p++) {
return (void *)(p - 1);
}
}
return NULL;
}
EXPORT_SYMBOL(memchr);
#endif
可看到若無 def __HAVE_ARCH_MEMCHR
,則會使用 string.h
中的 memchr
。
追溯 __HAVE_ARCH_MEMCHR
,發現在 arch
目錄下各個架構 CPU ,如 arm 、 arm64 、 powerpc 、 x86 等的 string.h
都有,其中 x86 的部分只有在 string_32.h
、 string_32.c
中有 define。
接著觀察 x86 這兩個檔案,擷取如下:
#define __HAVE_ARCH_MEMCHR
extern void *memchr(const void *cs, int c, size_t count);
#ifdef __HAVE_ARCH_MEMCHR
void *memchr(const void *cs, int c, size_t count)
{
int d0;
void *res;
if (!count)
return NULL;
asm volatile("repne\n\t"
"scasb\n\t"
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
: "=D" (res), "=&c" (d0)
: "a" (c), "0" (cs), "1" (count)
: "memory");
return res;
}
EXPORT_SYMBOL(memchr);
#endif
string_32.h
中 define __HAVE_ARCH_MEMCHR
,並在 string_32.c
中判斷是否已 define ,並且實作程式碼。
接著再看 string_64.h
,其內容也類似上述描述,都是 #define __HAVE_ARCH_XXXX
接著 function 的宣告,只是沒有統一的 string_64.c
,而是各個 .s
file 的組合語言檔案。因此,memchr-opt-x86
要加入且暫時不管 x86-32
的話,應該是在 string_64.h
做出類似前述的 #defin
,並且在 /arch/x86/lib
建立一個對應的 .c
檔。
之後會嘗試在 UML 上修改並測試。
增加下面這兩行
#define __HAVE_ARCH_MEMCHR
extern void *memchr(const void *cs, int c, size_t count);
並將我的 memchr-opt-x86
新增成 string_64.c
,如下:
#define __NO_FORTIFY
#include <linux/string.h>
#include <linux/export.h>
/* Nonzero if either X or Y is not aligned on a "long" boundary */
#define UNALIGNED(X) ((long) X & (sizeof(long) - 1))
/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))
/* Threshhold for punting to the bytewise iterator */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
#ifdef __HAVE_ARCH_MEMCHR
void *memchr(const void *src_void, int c, size_t length){
const unsigned char *src = (const unsigned char *) src_void;
unsigned char d = c;
while (UNALIGNED(src)) {
if (!length--)
return NULL;
if (*src == d)
return (void *) src;
src++;
}
if (!TOO_SMALL(length)) {
//unsigned long *asrc = (unsigned long *) src;
unsigned long mask = d << 8 | d;
unsigned int i = 32;
mask = mask << 16 | mask;
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
mask = (mask << i) | mask;
unsigned long xor, consta = 0xFEFEFEFEFEFEFEFF,constb = 0x8080808080808080;
unsigned long data;
asm volatile(
"1:\n\t"
"movq (%0),%1\n\t"
"xorq %6,%1\n\t"
"lea (%1,%4), %2\n\t"
"notq %1\n\t"
"andq %5,%1\n\t"
"testq %1,%2\n\t"
"jne 2f\n\t"
"sub $8,%3\n\t"
"add $8,%0\n\t"
"cmp $7,%3\n\t"
"jae 1b\n\t"
"2:\n\t"
:"=D"(src),"=r"(data),"=r"(xor),"=r"(length)
:"r"(consta),"r"(constb),"r"(mask),"0"(src),"1"(data),"2"(xor),"3"(length)
:"memory");
//src = (unsigned char *) asrc;
}
while (length--) {
if (*src == d){
return (void *) src;
}
src++;
}
return (void *) NULL;
}
EXPORT_SYMBOL(memchr);
#endif
compile 成 UML 並使用命令 objdump -xd linux | grep -A12 "<memchr>"
卻沒有找到 memchr
之組合語言,使用命令 objdump -xd linux | grep memchr
,部分輸出如下:
0000000060236a09 l F .text 00000000000000bc memchr_inv
000000006001ca80 F *UND* 0000000000000000 memchr@@GLIBC_2.2.5
000000006001ca80 <memchr@plt>:
6001ca80: ff 25 72 37 55 00 jmpq *0x553772(%rip) # 605701f8 <memchr@GLIBC_2.2.5>
顯示成功移除掉原本的實作,但我的程式並沒有成功放入。推測是沒有進行編譯並放入核心,因此要需要對 Makefile
做修改。
arch/x86/lib
...
ifeq ($(CONFIG_X86_32),y)
obj-y += atomic64_32.o
lib-y += atomic64_cx8_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += string_32.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o
+ lib-y += string_64.o
lib-y += cmpxchg16b_emu.o
endif
增加此行後,按照前述反組譯仍沒有出現,我繼續尋找解法。我追蹤 string_32.o
,發現在 arch/x86/um/Makefile
中有一段如下
ifeq ($(CONFIG_X86_32),y)
obj-y += checksum_32.o syscalls_32.o
obj-$(CONFIG_ELF_CORE) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o
subarch-y += ../kernel/sys_ia32.o
else
obj-y += syscalls_64.o vdso/
subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o
endif
在第 23 行有 ../lib/string_32.o
,其所在的 ifeq
是判斷 CPU 是否為 32 bits ,則下面的 else 必為 64 bits ,因此我在第 31 行最後面增加 ../lib/string_64.o
,再編譯一次 UML 。
首先輸入命令 objdump -xd linux | grep memchr
,擷取部分輸出如下:
0000000060236a63 l F .text 00000000000000bc memchr_inv
0000000060036cde l F .text 00000000000000aa memchr
接著 objdump -xd linux | grep -A53 "<memchr>"
0000000060036cde <memchr>:
60036cde: f3 0f 1e fa endbr64
60036ce2: 41 89 f0 mov %esi,%r8d
60036ce5: 40 f6 c7 07 test $0x7,%dil
60036ce9: 74 1d je 60036d08 <memchr+0x2a>
60036ceb: 48 85 d2 test %rdx,%rdx
60036cee: 75 07 jne 60036cf7 <memchr+0x19>
60036cf0: 31 ff xor %edi,%edi
60036cf2: e9 8d 00 00 00 jmpq 60036d84 <memchr+0xa6>
60036cf7: 48 ff ca dec %rdx
60036cfa: 44 38 07 cmp %r8b,(%rdi)
60036cfd: 0f 84 81 00 00 00 je 60036d84 <memchr+0xa6>
60036d03: 48 ff c7 inc %rdi
60036d06: eb dd jmp 60036ce5 <memchr+0x7>
60036d08: 48 83 fa 07 cmp $0x7,%rdx
60036d0c: 76 60 jbe 60036d6e <memchr+0x90>
60036d0e: 49 b9 ff fe fe fe fe movabs $0xfefefefefefefeff,%r9
60036d15: fe fe fe
60036d18: 89 f1 mov %esi,%ecx
60036d1a: 40 0f b6 f6 movzbl %sil,%esi
60036d1e: 31 c0 xor %eax,%eax
60036d20: 49 ba 80 80 80 80 80 movabs $0x8080808080808080,%r10
60036d27: 80 80 80
60036d2a: c1 e1 08 shl $0x8,%ecx
60036d2d: 0f b7 c9 movzwl %cx,%ecx
60036d30: 09 f1 or %esi,%ecx
60036d32: 48 63 c9 movslq %ecx,%rcx
60036d35: 48 89 ce mov %rcx,%rsi
60036d38: 48 c1 e6 10 shl $0x10,%rsi
60036d3c: 48 09 ce or %rcx,%rsi
60036d3f: 48 89 f1 mov %rsi,%rcx
60036d42: 48 c1 e1 20 shl $0x20,%rcx
60036d46: 48 09 f1 or %rsi,%rcx
60036d49: 31 f6 xor %esi,%esi
60036d4b: 48 8b 07 mov (%rdi),%rax
60036d4e: 48 31 c8 xor %rcx,%rax
60036d51: 4a 8d 34 08 lea (%rax,%r9,1),%rsi
60036d55: 48 f7 d0 not %rax
60036d58: 4c 21 d0 and %r10,%rax
60036d5b: 48 85 c6 test %rax,%rsi
60036d5e: 75 0e jne 60036d6e <memchr+0x90>
60036d60: 48 83 ea 08 sub $0x8,%rdx
60036d64: 48 83 c7 08 add $0x8,%rdi
60036d68: 48 83 fa 07 cmp $0x7,%rdx
60036d6c: 73 dd jae 60036d4b <memchr+0x6d>
60036d6e: 48 01 fa add %rdi,%rdx
60036d71: 48 39 fa cmp %rdi,%rdx
60036d74: 0f 84 76 ff ff ff je 60036cf0 <memchr+0x12>
60036d7a: 44 38 07 cmp %r8b,(%rdi)
60036d7d: 74 05 je 60036d84 <memchr+0xa6>
60036d7f: 48 ff c7 inc %rdi
60036d82: eb ed jmp 60036d71 <memchr+0x93>
60036d84: 48 89 f8 mov %rdi,%rax
60036d87: c3 retq
注意到 60036d4b
到 60036d6c
是我寫的組合語言,至此我的程式成功進入 UML 中。
unsigned long
或 long
的 macro執行 grep -r "sizeof(unsigned long)" *
或 grep -r "sizeof(long)" *
找到以下
arch/x86/crypto/crc32c-intel_glue.c:#define SCALE_F sizeof(unsigned long)
在尋找 macro 前,發現 kernel 中有些程式碼直接實作 unalign 而非使用 macro ,如 /lib/string.c
中的 strscpy
ssize_t strscpy(char *dest, const char *src, size_t count)
{
...
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
* If src is unaligned, don't cross a page boundary,
* since we don't know if the next page is mapped.
*/
if ((long)src & (sizeof(long) - 1)) {
size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));
if (limit < max)
max = limit;
}
#else
/* If src or dest is unaligned, don't do word-at-a-time. */
if (((long) dest | (long) src) & (sizeof(long) - 1))
max = 0;
#endif
...
}
在 include/linux/align.h
有 alignment 的 macro 。
#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0)
對比原本的 unalign
#define UNALIGNED(X) ((long) X & (sizeof(long) - 1))
可發現實作方式基本一樣,只是多個 == 0
,因此可以直接使用。
改成如下
while (!IS_ALIGNED((long)src, sizeof(long))) {
if (!length--)
return NULL;
if (*src == d)
return (void *) src;
src++;
}
參考 建構 User-Mode Linux 的實驗環境
編譯完 linux
執行檔,進行 UML 的設定時,執行 UML.sh
,其內容如下
#!/bin/sh
./linux umid=uml0 \
root=/dev/root rootfstype=hostfs hostfs=./rootfs \
rw mem=64M init=/bin/sh quiet
出現以下錯誤
Aborted (core dumped)
於是決定直接執行 ./linux
並比較
原版
Core dump limits :
soft - 0
hard - NONE
Checking that ptrace can change system call numbers...OK
Checking syscall emulation patch for ptrace...OK
Checking advanced syscall emulation patch for ptrace...OK
Checking environment variables for a tempdir...none found
Checking if /dev/shm is on tmpfs...OK
Checking PROT_EXEC mmap in /dev/shm...OK
Adding 23236608 bytes to physical memory to account for exec-shield gap
Linux version 5.15.0 (hihihi@hihihi-VirtualBox) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #1 Sun May 8 15:57:10 CST 2022
Zone ranges:
Normal [mem 0x0000000000000000-0x0000000063628fff]
Movable zone start for each node
Early memory node ranges
node 0: [mem 0x0000000000000000-0x0000000003628fff]
Initmem setup node 0 [mem 0x0000000000000000-0x0000000003628fff]
Built 1 zonelists, mobility grouping on. Total pages: 13675
Kernel command line: root=98:0 console=tty
Dentry cache hash table entries: 8192 (order: 4, 65536 bytes, linear)
Inode-cache hash table entries: 4096 (order: 3, 32768 bytes, linear)
mem auto-init: stack:off, heap alloc:off, heap free:off
Memory: 26104K/55460K available (3379K kernel code, 925K rwdata, 1100K rodata, 144K init, 164K bss, 29356K reserved, 0K cma-reserved)
...
增加 string_64.c
版本
Core dump limits :
soft - 0
hard - NONE
Checking that ptrace can change system call numbers...OK
Checking syscall emulation patch for ptrace...OK
Checking advanced syscall emulation patch for ptrace...OK
Checking environment variables for a tempdir...none found
Checking if /dev/shm is on tmpfs...OK
Checking PROT_EXEC mmap in /dev/shm...OK
Adding 1441792 bytes to physical memory to account for exec-shield gap
Aborted (core dumped)
似乎是要印出 Linux version 5.15.0 ...
時出現 Aborted (core dumped)
,經過一番查找,發現是我自己寫的 memchr
中組合語言有問題,用原本測驗題的程式碼不會 Aborted (core dumped)
,經過測試,發現是迴圈的條件判斷出問題,即這段指令 jae 1b\n\t
,修改成 ja 1b\n\t
即沒有問題。
在 Userspace 進行正確性驗證,將測試字串放入 memchr-opt-x86
及 glibc 採用 SSE2 指令的 memchr
並比較字串內容,測試到字串長度為 10 萬皆相同。
修改後的結果如下:
5/20 加入初始版註解
// SPDX-License-Identifier: GPL-2.0
#include <linux/string.h>
#include <linux/export.h>
#include <linux/align.h>
/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))
#ifdef __HAVE_ARCH_MEMCHR
void *memchr(const void *cs, int c, size_t length)
{
const unsigned char *src = (const unsigned char *)cs;
unsigned char d = c;
while (!IS_ALIGNED((long)src, sizeof(long))) {
if (!length--)
return NULL;
if (*src == d)
return (void *)src;
src++;
}
if (length >= LBLOCKSIZE) {
unsigned long mask = d << 8 | d;
unsigned int i = 32;
long xor, data;
const long consta = 0xFEFEFEFEFEFEFEFF,
constb = 0x8080808080808080;
/*
* Create a 8-bytes mask for word-wise comparing.
* For example, a mask for 'a' is 0x6161616161616161.
*/
mask = mask << 16 | mask;
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
mask = (mask << i) | mask;
/*
* We perform word-wise comparing with following operation:
* 1. Xor the long word @src and @mask into @xor.
* 2. Add @xor with @consta.
* 3. ~@xor & @constb.
* 4. Perform & with the result of step 2 and 3.
*
* Step 1 creates a byte which is 0 in the long word if
* there is at least one target byte in it.
*
* Step 2 to Step 4 find if there is a byte with 0 in
* the long word.
*/
asm volatile("1:\n\t"
"movq (%0),%1\n\t"
"xorq %6,%1\n\t"
"lea (%1,%4), %2\n\t"
"notq %1\n\t"
"andq %5,%1\n\t"
"testq %1,%2\n\t"
"jne 2f\n\t"
"add $8,%0\n\t"
"sub $8,%3\n\t"
"cmp $7,%3\n\t"
"ja 1b\n\t"
"2:\n\t"
: "=D"(src), "=r"(xor), "=r"(data), "=r"(length)
: "r"(consta), "r"(constb), "r"(mask), "0"(src),
"1"(xor), "2"(data), "3"(length)
: "memory", "cc");
}
while (length--) {
if (*src == d)
return (void *)src;
src++;
}
return NULL;
}
EXPORT_SYMBOL(memchr);
#endif
準備核心模組時,輸入下列命令
$ make _modinst_ MODLIB=`pwd`/rootfs/lib/modules/VER ARCH=um
出現以下
make: *** No rule to make target '_modinst_'. Stop.
改成下列命令即可。
make modules_install MODLIB=`pwd`/rootfs/lib/modules/VER ARCH=um
在 Linux 核心程式碼最上層建立一個名為 tests
的目錄,對 建構 User-Mode Linux 的實驗環境 範例的 hello.c
稍作修改。
hello.c
內容:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
MODULE_LICENSE("GPL");
static int hello_init(void)
{
char str[1000];
char ch = 'x';
long time;
char *ret;
ktime_t kt;
memset(str, '0', sizeof(char)*1000);
str[998] = 'x';
str[999] = '\0';
kt = ktime_get();
ret = memchr(str, ch, strlen(str));
time = ktime_to_ns(ktime_sub(ktime_get(), kt));
printk(KERN_ALERT "Hello World! - init\n");
printk(KERN_ALERT "%s\n", ret);
printk(KERN_ALERT "%ld\n", time);
return 0;
}
static void hello_exit(void)
{
printk(KERN_ALERT "Hello World! - exit\n");
}
module_init(hello_init);
module_exit(hello_exit);
Makefile
如下
obj-m += hello.o
PWD := $(shell pwd)
KDIR := $(PWD)/..
default:
$(MAKE) -C $(KDIR) M=$(PWD) modules ARCH=um
clean:
$(MAKE) -C $(KDIR) M=$(PWD) clean ARCH=um
編譯上述核心模組並複製到 rootfs 中
$ make -C tests
$ cp tests/hello.ko rootfs/
啟動 UML 並掛載
UML:/ # insmod hello.ko
在增加 string_64.c
的 UML 輸出結果如下
Hello World! - init
x
512
原始版本如下
Hello World! - init
x
768
第三行的結果為用 ktime
得到的時間,因為是 UML ,其值不代表實際時間,但可看出有我的 memchr
的確較快,也證實程式碼確實有進入核心。
考慮以下讓程式碼更精簡的修改:
@@ -12,8 +12,7 @@
void *memchr(const void *cs, int c, size_t length)
{
- const unsigned char *src = (const unsigned char *)cs;
- unsigned char d = c;
+ const unsigned char *src = (const unsigned char *)cs, d = c;
while (!IS_ALIGNED((long)src, sizeof(long))) {
if (!length--)
return NULL;
@@ -33,9 +32,9 @@ void *memchr(const void *cs, int c, size
* For example, a mask for 'a' is 0x6161616161616161.
*/
- mask = mask << 16 | mask;
+ mask |= mask << 16;
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
- mask = (mask << i) | mask;
+ mask |= mask << i;
/*
* We perform word-wise comparing with following operation:
* 1. Xor the long word @src and @mask into @xor.
記得要用 checkpath 檢查,搭配 –strict
選項
memchr()
放入核心並重新 compile這是在虛擬機重新編譯核心,劃分的硬碟空間建議只少 50 GB,CPU Core至少分配 4 個。
安裝必要套件
$ sudo apt-get install liblz4-tool
$ sudo apt-get install libc6-dev
$ sudo apt-get install bison
$ sudo apt-get install flex
$ sudo apt-get install libelf-dev
$ sudo apt-get install libncurses5-dev openssl libssl-dev
$ sudo apt-get install xz-utils wget ca-certificates bc
$ sudo apt install build-essential libncurses-dev
$ sudo apt-get install libzstd-dev
$ sudo apt-get install zstd
下載 Linux Kernel 程式碼
$ wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.17.9.tar.xz
解壓縮並進入到此目錄
$ cd linux-5.17.9
複製原本的 .config
$ cp /boot/5.13.0-41-generic .config
產生 所需 .config
$ make menuconfig
選擇 load
再選擇 save
接著執行下列命令或動作避免編譯過程產生錯誤
$ scripts/config --disable SYSTEM_TRUSTED_KEYS
$ scripts/config --disable SYSTEM_REVOCATION_KEYS
並將 .config
中 CONFIG_DEBUG_INFO_BTF
改成如下:
CONFIG_DEBUG_INFO_BTF=n
開始編譯, -j 4
表示動用四個 CPU Core 加速編譯,此過程耗費時間最長。
$ sudo make -j 4
跑完會有以下輸出
Kernel: arch/x86/boot/bzImage is ready (#2)
編譯 modules
$ sudo make modules -j 4
安裝 module
$ sudo make INSTALL_MOD_STRIP=1 modules_install -j 4
安裝核心
$ sudo make install
$ sudo mkinitramfs /lib/modules/5.17.9/ -o /boot/initrd.img-5.17.9
更新 grub
$ sudo update-grub2
接著重新開機,從 UML 的安裝可知 memchr()
和開機有關,若核心有成功更新成 5.17.9
且能成功開機,則更新後的 memchr()
應該是沒有大問題。
輸入命令檢查核心版本
$ uname -r
得到以下輸出
5.17.9
核心版本成功更新。
接著一樣撰寫 kernel module 觀察 memchr-opt-x86
是否真的編譯入核心。
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
MODULE_LICENSE("GPL");
#define LBLOCKSIZE (sizeof(long))
void *memchr1(const void *cs, int c, size_t length)
{
const unsigned char *src = (const unsigned char *)cs;
unsigned char d = c;
while (!IS_ALIGNED((long)src, sizeof(long))) {
if (!length--)
return NULL;
if (*src == d)
return (void *)src;
src++;
}
if (length >= LBLOCKSIZE) {
unsigned long mask = d << 8 | d;
unsigned int i = 32;
long xor, data;
const long consta = 0xFEFEFEFEFEFEFEFF,
constb = 0x8080808080808080;
/*
* Create a 8-bytes mask for word-wise comparing.
* For example, a mask for 'a' is 0x6161616161616161.
*/
mask = mask << 16 | mask;
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
mask = (mask << i) | mask;
/*
* We perform word-wise comparing with following operation:
* 1. Xor the long word @src and @mask into @xor.
* 2. Add @xor with @consta.
* 3. ~@xor & @constb.
* 4. Perform & with the result of step 2 and 3.
*
* Step 1 creates a byte which is 0 in the long word if
* there is at least one target byte in it.
*
* Step 2 to Step 4 find if there is a byte with 0 in
* the long word.
*/
asm volatile("1:\n\t"
"movq (%0),%1\n\t"
"xorq %6,%1\n\t"
"lea (%1,%4), %2\n\t"
"notq %1\n\t"
"andq %5,%1\n\t"
"testq %1,%2\n\t"
"jne 2f\n\t"
"add $8,%0\n\t"
"sub $8,%3\n\t"
"cmp $7,%3\n\t"
"ja 1b\n\t"
"2:\n\t"
: "=D"(src), "=r"(xor), "=r"(data), "=r"(length)
: "r"(consta), "r"(constb), "r"(mask), "0"(src),
"1"(xor), "2"(data), "3"(length)
: "memory", "cc");
}
while (length--) {
if (*src == d)
return (void *)src;
src++;
}
return NULL;
}
static int hello_init(void)
{
char str[1000];
char ch = 'x';
long time,time1;
char *ret, *ret2;
ktime_t kt,kt2;
memset(str, '0', sizeof(char)*1000);
str[998] = 'x';
str[999] = '\0';
kt = ktime_get();
ret = memchr1(str, ch, strlen(str));
time = ktime_to_ns(ktime_sub(ktime_get(), kt));
printk(KERN_ALERT "Hello World! - init\n");
printk(KERN_ALERT "%s\n", ret);
printk(KERN_ALERT "%ld\n", time);
kt2 = ktime_get();
ret2 = memchr1(str, ch, strlen(str));
time1 = ktime_to_ns(ktime_sub(ktime_get(), kt2));
printk(KERN_ALERT "%s\n", ret2);
printk(KERN_ALERT "%ld\n", time1);
return 0;
}
static void hello_exit(void)
{
printk(KERN_ALERT "Hello World! - exit\n");
}
module_init(hello_init);
module_exit(hello_exit);
Makefile
obj-m += hello.o
PWD := $(shell pwd)
KDIR := /lib/modules/$(shell uname -r)/build
default:
make -C $(KDIR) M=$(PWD) modules
clean:
make -C $(KDIR) M=$(PWD) clean
執行輸入下列命令將核心模組放入核心
$ sudo insmod hello.ko
$ sudo rmmod hello
得到以下結果
[ 1003.758432] Hello World! - init
[ 1003.758435] x
[ 1003.758436] 458
[ 1003.758437] x
[ 1003.758437] 462
[ 1005.920261] Hello World! - exit
可見核心的 memchr 和 memchr-opt-x86
執行時間相近,可以說成功放入 Linux Kernel。
今天發布 5.18 版,可能要再編譯一次核心並測試
經過重複上述步驟,在 5.18 Linux kernel 成功開機,利用上述的 kernel module 的結果也相似。
第一個 commit message 內容
x86/lib: Optimize memchr()
The original assembly version of memchr() is implemented with
the byte-wise comparing technique, which does not fully use 64-bits
registers in x86-64 CPU. We use word-wide comparing so that 8
characters can be compared at the same time on x86-64 CPU. First,
we align the input and then use word-wise comparing to find the
first 64-bit word that contain the target. Secondly, we compare
every byte in the word and get the output.
We create two files to measure the performance. The first file
contains on average 10 characters ahead the target character.
The second file contains at least 1000 characters ahead the
target character. Our implementation of “memchr()” is slightly
better in the first test and nearly 4x faster than the orginal
implementation in the second test.
Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com>
Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
第二個 commit message 內容
x86/um: Use x86_64-optimized memchr
Add x86_64-optimized memchr, which is 4x faster
than the original implementation, into um.
Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com>
Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
第一個 commit 跑命令 ./scripts/checkpatch.pl -strict
出現以下訊息
CHECK: extern prototypes should be avoided in .h files
#42: FILE: arch/x86/include/asm/string_64.h:18:
+extern void *memchr(const void *cs, int c, size_t length);
WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#59:
new file mode 100644
WARNING
的部分確定是要新增檔案,因此可以不理會。 CHECK
的部分建議在 .h
中避免 extern
prototypes ,我翻查 Linux Kernel 中其他 string.h
相關的 memchr()
prototype ,皆是 extern
prototype 。
按照老師的指示增加了 cover letter
Subject: [PATCH 0/2] x86: Optimize memchr() for x86-64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
*** BLURB HERE ***
These patch series add an optimized "memchr()" for x86-64 and
USER-MODE LINUX (UML).
There exists an assemebly implementation for x86-32. However,
for x86-64, there isn't any optimized version. We implement word-wise
comparison so that 8 characters can be compared at the same time on
x86-64 CPU. The optimized “memchr()” is nearly 4x faster than the
orginal implementation for long strings.
We test the optimized “memchr()” in UML and also recompile the 5.18
Kernel with the optimized “memchr()”. They run correctly.
In this patch we add a new file "string_64.c", which only contains
"memchr()". We can add more optimized string functions in it in the
future.
Yu-Jen Chang (2):
x86/lib: Optimize memchr()
x86/um: Use x86_64-optimized memchr
arch/x86/include/asm/string_64.h | 3 ++
arch/x86/lib/Makefile | 1 +
arch/x86/lib/string_64.c | 78 ++++++++++++++++++++++++++++++++
arch/x86/um/Makefile | 2 +-
4 files changed, 83 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/lib/string_64.c
--
2.25.1
第一份
$ ./scripts/get_maintainer.pl -f arch/x86/include/asm/string_64.h
Thomas Gleixner <tglx@linutronix.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Ingo Molnar <mingo@redhat.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Borislav Petkov <bp@alien8.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Dave Hansen <dave.hansen@linux.intel.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
x86@kernel.org (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
"H. Peter Anvin" <hpa@zytor.com> (reviewer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Kees Cook <keescook@chromium.org> (supporter:FORTIFY_SOURCE)
linux-kernel@vger.kernel.org (open list:X86 ARCHITECTURE (32-BIT AND 64-BIT))
linux-hardening@vger.kernel.org (open list:FORTIFY_SOURCE)
第二份
$ ./scripts/get_maintainer.pl -f arch/x86/um
Richard Weinberger <richard@nod.at> (maintainer:USER-MODE LINUX (UML))
Anton Ivanov <anton.ivanov@cambridgegreys.com> (maintainer:USER-MODE LINUX (UML))
Johannes Berg <johannes@sipsolutions.net> (maintainer:USER-MODE LINUX (UML))
Thomas Gleixner <tglx@linutronix.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Ingo Molnar <mingo@redhat.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Borislav Petkov <bp@alien8.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Dave Hansen <dave.hansen@linux.intel.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
x86@kernel.org (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
"H. Peter Anvin" <hpa@zytor.com> (reviewer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
linux-um@lists.infradead.org (open list:USER-MODE LINUX (UML))
linux-kernel@vger.kernel.org (open list:X86 ARCHITECTURE (32-BIT AND 64-BIT))
第二份的收件人應該是 Jeff Dike (5/28 更正), email 為 jdike@linux.intel.com
。第一份的收件人應為 Andi Kleen 。雖然他不在前述的名單上,之前用 git blame
查詢到他是相關檔案的維護者,他的 email 為 ak@linux.intel.com
。
安裝 git-email
$ sudo apt-get install git-email
設定 .gitconfig
[user]
name = arthurchang09
email = arthurchang09@gmail.com
[sendemail]
smtpEncryption = tls
smtpServer = smtp.gmail.com
smtpServerPort = 587
smtpUser = arthurchang09@gmail.com
由於我的 gmail 有設定雙重驗證,所以要透過 git 送 email 要開啟應用程式密碼,在手機開啟 Google帳戶 -> 安全性 -> 應用程式密碼 ,在此設定密碼。
接著輸入命令
git send-email --to arthurchang09@gmail.com \
--cc arthur09123456@gmail.com \
0001-x86-lib-Optimize-memchr.patch\
0002-um-Use-x86_64-optimized-memchr.patch
要輸入密碼時,輸入前面設定的密碼,抑或是在 .gitconfig
設定。
傳輸完成後在我的兩個信箱皆有收到信。
開發者有三點回復:
d = c
好像是 pointer 和值的比較,可能有問題0xFEFEFEFEFEFEFEFF
) constb (0x8080808080808080
) 的用途是什麼?jne
指令是否要改為 jnz
,後者似乎更合適。經過一番檢視,我的答案如下,目前尚未回覆給開發者:
d
的宣告方式,這應是 unsigned char
而非 unsigned char*
,一開始的 unsigned char* src
讓開發者以為 d
是 pointer
,實際上不是。(可能要指出 C 語言規格書的說明?)0x0101010101010101
,只是為了減少步驟寫成加法。因此 CF flag 不需要處理。 指令寫成 jnz
會更有助於理解,但其作用和 jne
相同,皆是檢查 Zero Flag ,會做修改。Andi Kleen 則是要求指出在 Kernel 中具體的效益,目前在研究中
另一位開發者 David Laight 也有提出幾點意見:
mask |= mask << i
%rdi
?read-write
的 register 要用 +r
,不要重複寫add
and jnz
.我經過測試,發現應該是要將 length
變成 8 的倍數,因此將 IS_ALIGN()
巨集改成 length & LBLOCKSIZE
。 if
條件判斷中將 for
迴圈去掉,將 inline assembly 做一些調整並改寫了一小部分。經過 UML 測試能成功啟動。 但對於 David Laight 最後兩點還是不知如何達成。
我去掉了 alignment 的部分,並且新增一個變數 dst
,為執行 word-wise comparison 的上界。因此在組合語言中,不必對 length
做減法,只需比較 src
和 dst
之大小關係,省下一個指令,等於每跑一次迴圈,減少執行一個指令。
// SPDX-License-Identifier: GPL-2.0
#include <linux/string.h>
#include <linux/export.h>
#include <linux/align.h>
/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))
#ifdef __HAVE_ARCH_MEMCHR
void *memchr(const void *cs, int c, size_t length)
{
const unsigned char *src = (const unsigned char *)cs;
const unsigned char *end = src + length;
if (length >= LBLOCKSIZE) {
unsigned long mask = c << 8 | c;
long xor, data;
const long consta = 0xFEFEFEFEFEFEFEFF,
constb = 0x8080808080808080;
const unsigned char *dst = (const unsigned char *)src +
(length & 0xFFFFFFFFFFFFFFF8);
/*
* Create a 8-bytes mask for word-wise comparing.
* For example, a mask for 'a' is 0x6161616161616161.
*/
mask |= mask << 16;
mask |= mask << 32;
/*
* We perform word-wise comparing with following operation:
* 1. Perform xor on the long word @src and @mask
* and put into @xor.
* 2. Add @xor with @consta.
* 3. ~@xor & @constb.
* 4. Perform & with the result of step 2 and 3.
*
* Step 1 creates a byte which is 0 in the long word if
* there is at least one target byte in it.
*
* Step 2 to Step 4 find if there is a byte with 0 in
* the long word.
*/
asm volatile("1:\n\t"
"movq (%[src]),%[xor]\n\t"
"xorq %[mask],%[xor]\n\t"
"lea (%[xor],%[const_a]), %[tmp]\n\t"
"notq %[xor]\n\t"
"andq %[const_b],%[xor]\n\t"
"testq %[xor],%[tmp]\n\t"
"jnz 2f\n\t"
"add $8,%[src]\n\t"
"cmp %[src], %[dst]\n\t"
"ja 1b\n\t"
"2:\n\t"
:
[src] "+r"(src), [xor] "+r"(xor), [tmp] "+r"(data)
: [const_a] "r"(consta), [const_b] "r"(constb),
[mask] "r"(mask), [dst] "r"(dst)
: "memory", "cc");
}
while (src <= end) {
if (*src == c)
return (void *)src;
src++;
}
return NULL;
}
EXPORT_SYMBOL(memchr);
#endif
David Laight 將我的程式碼改寫,整題而言較為簡潔易懂,沒有組合語言。也解決 target character 為負時會出錯的問題。我認文可以當最終版本
void *memchr(const void *p, int c, unsigned long length)
{
unsigned long mask, val;
const void *end = p + length;
c &= 0xff;
if (p <= end - 8) {
mask = c | c << 8;
mask |= mask << 16;
mask |= mask << 32;
for (; p <= end - 8; p += 8) {
val = *(unsigned long *)p ^ mask;
if ((val + 0xfefefefefefefeffu) & (~val & 0x8080808080808080u))
break;
}
}
for (; p < end; p++)
if (*(unsigned char *)p == c)
return p;
return NULL;
}
在程式碼中加入
printk("le %ld\n", length);
發現執行 UML 時會卡住,因此改寫成
if (length >= 100){
printk("le %d\n", length);
}
接著先執行先前編譯好的 UML 執行檔
$ ./linux
擷取部分輸出
...
Linux version 5.18.0 (hihihi@hihihi-VirtualBox) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #37 Mon Jun 6 17:46:51 CST 2022
l 162,10
...
Memory: 26132K/50688K available (3315K kernel code, 1002K rwdata, 1112K rodata, 143K init, 166K bss, 24556K reserved, 0K cma-reserved)
l 134,10
...
Console initialized on /dev/tty0
l 162,10
l 134,10
...
可看到 call memchr()
時,字串的長度並不長,大約只有不到 200 個字元。根據先前的測試,優化過的 memchr()
的效能優化並不明顯。
[ 0.000000] le 177
[ 0.000000] Command line: BOOT_IMAGE=/boot/vmlinuz-5.18.0 root=UUID=43f42c7b-5930-4dc9-b2f4-48f8220dd500 ro quiet splash
[ 0.000000] le 108
...
[ 3.620183] audit: type=1400 audit(1654591700.584:10): apparmor="STATUS" operation="profile_load" profile="unconfined" name="/usr/lib/connman/scripts/dhclient-script" pid=579 comm="apparmor_parser"
[ 3.830941] le 177
[ 3.830943] le 108
[ 3.830953] le 104
[ 3.830971] le 103
[ 3.830975] le 115
[ 3.830976] le 110
[ 3.830978] le 142
[ 3.831234] le 104
[ 3.831237] le 132
[ 3.831272] le 115
[ 3.831663] le 123
[ 3.831823] le 114
[ 3.831824] le 120
[ 3.831825] le 241
[ 3.831827] le 227
[ 3.831847] le 150
[ 3.831850] le 101
[ 3.831851] le 156
[ 3.831945] le 109
[ 3.831947] le 166
[ 3.831948] le 161
[ 3.831949] le 155
[ 3.831950] le 174
[ 3.831951] le 159
[ 3.831951] le 172
[ 3.831952] le 189
[ 3.831953] le 182
[ 3.831954] le 185
[ 4.129413] e1000: enp0s3 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX
...
[ 9.045064] 08:48:26.011645 main 6.1.14 r140239 started. Verbose level = 0
[ 9.045492] le 127
[ 9.045495] le 126
[ 9.046254] 08:48:26.012759 main vbglR3GuestCtrlDetectPeekGetCancelSupport: Supported (#1)
[ 9.186262] systemd-journald[256]: File /var/log/journal/6196ff73bdf74ff4ae61063ebf32f0c3/user-1000.journal corrupted or uncleanly shut down, renaming and replacing.
[ 9.189583] le 153
[ 10.149762] Service: Shared Clipboard
[ 15.078903] le 177
[ 15.078907] le 108
[ 15.078915] le 104
[ 15.078928] le 103
[ 15.078931] le 115
[ 15.078932] le 110
[ 15.078933] le 142
[ 15.078933] le 177
[ 15.078935] le 108
[ 15.078939] le 104
[ 15.078943] le 104
[ 15.078949] le 132
[ 15.078962] le 103
[ 15.078965] le 115
[ 15.078966] le 110
[ 15.078968] le 142
[ 15.078973] le 104
[ 15.078976] le 132
[ 15.078976] le 115
[ 15.079002] le 115
[ 15.079014] le 123
[ 15.079040] le 123
[ 15.079041] le 114
[ 15.079042] le 120
[ 15.079043] le 241
[ 15.079044] le 227
[ 15.079060] le 150
[ 15.079062] le 101
[ 15.079063] le 156
[ 15.079067] le 114
[ 15.079067] le 120
[ 15.079069] le 241
[ 15.079070] le 109
[ 15.079070] le 227
[ 15.079072] le 166
[ 15.079073] le 161
[ 15.079074] le 155
[ 15.079074] le 174
[ 15.079075] le 159
[ 15.079076] le 172
[ 15.079076] le 189
[ 15.079077] le 182
[ 15.079078] le 185
[ 15.079085] le 127
[ 15.079086] le 150
[ 15.079087] le 126
[ 15.079088] le 101
[ 15.079089] le 153
[ 15.079089] le 156
[ 15.079096] le 109
[ 15.079098] le 166
[ 15.079099] le 161
[ 15.079100] le 155
[ 15.079100] le 174
[ 15.079101] le 159
[ 15.079102] le 172
[ 15.079102] le 189
[ 15.079103] le 182
[ 15.079104] le 185
[ 15.079111] le 127
[ 15.079113] le 126
[ 15.079114] le 177
[ 15.079115] le 153
[ 15.079115] le 108
[ 15.079122] le 104
[ 15.079136] le 103
[ 15.079139] le 115
[ 15.079139] le 110
[ 15.079141] le 142
[ 15.079141] le 177
[ 15.079142] le 108
[ 15.079147] le 104
[ 15.079149] le 132
[ 15.079150] le 104
[ 15.079163] le 103
[ 15.079166] le 115
[ 15.079167] le 110
[ 15.079168] le 142
[ 15.079174] le 104
[ 15.079176] le 115
[ 15.079176] le 132
[ 15.079202] le 115
[ 15.079213] le 123
[ 15.079239] le 114
[ 15.079240] le 123
[ 15.079240] le 120
[ 15.079242] le 241
[ 15.079243] le 227
[ 15.079258] le 150
[ 15.079260] le 101
[ 15.079261] le 156
[ 15.079266] le 114
[ 15.079267] le 120
[ 15.079268] le 109
[ 15.079268] le 241
[ 15.079270] le 227
[ 15.079270] le 166
[ 15.079271] le 161
[ 15.079272] le 155
[ 15.079273] le 174
[ 15.079273] le 159
[ 15.079274] le 172
[ 15.079275] le 189
[ 15.079275] le 182
[ 15.079276] le 185
[ 15.079284] le 127
[ 15.079285] le 126
[ 15.079286] le 150
[ 15.079288] le 101
[ 15.079289] le 156
[ 15.079289] le 153
[ 15.079296] le 109
[ 15.079298] le 166
[ 15.079299] le 161
[ 15.079299] le 155
[ 15.079300] le 174
[ 15.079301] le 159
[ 15.079301] le 172
[ 15.079302] le 189
[ 15.079303] le 182
[ 15.079303] le 185
[ 15.079311] le 127
[ 15.079313] le 126
[ 15.079316] le 153
[ 75.957408] le 177
[ 75.957411] le 108
[ 75.957419] le 104
[ 75.957433] le 103
[ 75.957435] le 115
[ 75.957436] le 110
[ 75.957437] le 142
[ 75.957443] le 104
[ 75.957445] le 132
[ 75.957472] le 115
[ 75.957509] le 123
[ 75.957535] le 114
[ 75.957536] le 120
[ 75.957537] le 241
[ 75.957539] le 227
[ 75.957554] le 150
[ 75.957556] le 101
[ 75.957557] le 156
[ 75.957563] le 109
[ 75.957565] le 166
[ 75.957566] le 161
[ 75.957567] le 155
[ 75.957567] le 174
[ 75.957568] le 159
[ 75.957568] le 172
[ 75.957569] le 189
[ 75.957570] le 182
[ 75.957570] le 185
[ 75.957578] le 127
[ 75.957580] le 126
[ 75.957581] le 153
[ 75.957627] le 177
[ 75.957628] le 108
[ 75.957635] le 104
[ 75.957648] le 103
[ 75.957651] le 115
[ 75.957652] le 110
[ 75.957653] le 142
[ 75.957659] le 104
[ 75.957661] le 132
[ 75.957687] le 115
[ 75.957724] le 123
[ 75.957750] le 114
[ 75.957751] le 120
[ 75.957752] le 241
[ 75.957753] le 227
[ 75.957768] le 150
[ 75.957771] le 101
[ 75.957771] le 156
[ 75.957778] le 109
[ 75.957780] le 166
[ 75.957781] le 161
[ 75.957781] le 155
[ 75.957782] le 174
[ 75.957782] le 159
[ 75.957783] le 172
[ 75.957784] le 189
[ 75.957784] le 182
[ 75.957785] le 185
[ 75.957793] le 127
[ 75.957794] le 126
[ 75.957796] le 153
[ 75.993516] le 177
[ 75.993520] le 108
[ 75.993530] le 104
[ 75.993549] le 103
[ 75.993554] le 115
[ 75.993554] le 110
[ 75.993556] le 142
[ 75.993564] le 104
[ 75.993568] le 132
[ 75.993609] le 115
[ 75.993668] le 123
[ 75.993711] le 114
[ 75.993712] le 120
[ 75.993714] le 241
[ 75.993717] le 227
[ 75.993742] le 150
[ 75.993745] le 101
[ 75.993747] le 156
[ 75.993757] le 109
[ 75.993759] le 166
[ 75.993760] le 161
[ 75.993761] le 155
[ 75.993762] le 174
[ 75.993763] le 159
[ 75.993764] le 172
[ 75.993765] le 189
[ 75.993766] le 182
[ 75.993766] le 185
[ 75.993779] le 127
[ 75.993782] le 126
[ 75.993784] le 153
[ 75.993885] le 177
[ 75.993887] le 108
[ 75.993898] le 104
[ 75.993920] le 103
[ 75.993925] le 115
[ 75.993926] le 110
[ 75.993928] le 142
[ 75.993937] le 104
[ 75.993941] le 132
[ 75.993987] le 115
[ 75.994049] le 123
[ 75.994091] le 114
[ 75.994092] le 120
[ 75.994094] le 241
[ 75.994097] le 227
[ 75.994121] le 150
[ 75.994124] le 101
[ 75.994126] le 156
[ 75.994136] le 109
[ 75.994139] le 166
[ 75.994140] le 161
[ 75.994141] le 155
[ 75.994142] le 174
[ 75.994143] le 159
[ 75.994144] le 172
[ 75.994145] le 189
[ 75.994146] le 182
[ 75.994147] le 185
[ 75.994159] le 127
[ 75.994161] le 126
[ 75.994164] le 153
以上是將化的 memchr()
重新編u4並放入 kernel ,開機後的結果。同樣最多只有 200 多的字元。
目前 arch/x86
目錄中,針對 x86-64 調整過的實作是 memcpy, memset, memmove,但其他的 mem/str 系列函式仍採用 generic 實作。也許可查閱 arch/x86/lib/string_32.c
以確認在 Linux 核心使用的狀況。
我接著在 /lib/string.c
的 memchr
增加程式碼
void *memchr(const void *s, int c, size_t n)
{
+ if (n >= 100) {
+ printk("lib %ld\n", n);
+ }
const unsigned char *p = s;
while (n-- != 0) {
if ((unsigned char)c == *p++) {
return (void *)(p - 1);
}
}
return NULL;
}
得到的結果類似先前的測試,顯示 generic 實作似乎沒有被使用。目前看起來 memchr()
的使用狀況看我之前的觀察類似,透過 #define __HAVE_ARCH_MEMCHR
決定使用哪種 memchr()
。由於前述尋找 memchr()
皆是在虛擬機上運行。接下來會嘗試在雙系統重新編譯 kernel 並且測試使否有所改變。
嘗試繼續尋找,在 /drivers/misc/lkdtm/heap.c
第 207 行找到
if (memchr(val, 0xAB, 512) == NULL) {
pr_info("Memory appears initialized (%x, no earlier values)\n", *val);
} else {
pr_err("FAIL: Slab was not initialized\n");
pr_expected_config_param(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, "init_on_alloc");
}
這裡可以看到要搜尋 512 byte
/lib/string.c
的 memchr()
與 memchr_inv
跟老師討論後決定改成貢獻 lib/string.c
中的 memchr()
,並且和其功能相似但作用相反的涵式 memchr_inv()
進行 macro 上的整合。
memchr_inv()
功能為找出第一個和不標字元不相同之字元位址,也就是 memchr()
相反的功能。
在 /lib/string.c
的實作如下
static void *check_bytes8(const u8 *start, u8 value, unsigned int bytes)
{
while (bytes) {
if (*start != value)
return (void *)start;
start++;
bytes--;
}
return NULL;
}
void *memchr_inv(const void *start, int c, size_t bytes)
{
u8 value = c;
u64 value64;
unsigned int words, prefix;
if (bytes <= 16)
return check_bytes8(start, value, bytes);
value64 = value;
#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
value64 *= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
value64 *= 0x01010101;
value64 |= value64 << 32;
#else
value64 |= value64 << 8;
value64 |= value64 << 16;
value64 |= value64 << 32;
#endif
prefix = (unsigned long)start % 8;
if (prefix) {
u8 *r;
prefix = 8 - prefix;
r = check_bytes8(start, value, prefix);
if (r)
return r;
start += prefix;
bytes -= prefix;
}
words = bytes / 8;
while (words) {
if (*(u64 *)start != value64)
return check_bytes8(start, value, 8);
start += 8;
words--;
}
return check_bytes8(start, value, bytes % 8);
}
這裡可看到在字串長度小於 16 字元時,直接呼叫 check_bytes8()
尋找第一個不符合的字元。
接著使用一連串的 #if defined
來建立 mask 。如果是 64-bits CPU ,其 BITS_PER_LONG
會是 64 ,以及有 ARCH_HAS_FAST_MULTIPLIER
,直接用乘法建立,像我的 CPU 屬於此類。若是 32-bits 且有 ARCH_HAS_FAST_MULTIPLIER
,會先建立 32-bits mask 再用 |
建成 64-bits mask。若沒有 ARCH_HAS_FAST_MULTIPLIER
則都採用 |
建立。
值得注意的是無論是 32-bits 或是 64-bits , memchr_inv()
一律採用 64-bits 的 mask 。然而在 32-bits CPU , u64
會需要使用兩個 register ,這樣執行效能是否和採用 unsigned int
效能一樣需要近一步的測試。
使用 complier explorer 並下 -m32
查看 32 bit 和 64 bits 的差別。在 32 bits 下都會用到較多的指令,如:
value64 = value;
32 bits
movzx eax, BYTE PTR [ebp-13]
mov DWORD PTR [ebp-24], eax
mov DWORD PTR [ebp-20], 0
64 bits
movzx eax, BYTE PTR [rbp-5]
mov QWORD PTR [rbp-16], rax
接下來 prefix
變數是用來處理 alignment ,將 unalign 的字元先處理掉。
while (words) {
if (*(u64 *)start != value64)
return check_bytes8(start, value, 8);
start += 8;
words--;
}
接著 while
迴圈對長度為 8 bytes
的字串進行比較,若有不同則呼叫 check_bytes8()
尋找第一個不同的字元的位址。
最後再處理長度不到 8 bytes
的字串。
memchr
和 memchr_inv
共同處並合併兩者最明顯的共同之處為皆是要產生一個 word 長度的 mask ,因此可以寫成一個巨集,如下:
#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
#define CREATE_MASK(val) \
val*= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
#define CREATE_MASK(val) \
val *= 0x01010101; \
val |= val << 32;
#else
#define CREATE_MASK(val) \
val |= val << 8; \
val |= val << 16; \
val |= val << 32;
#endif
巨集名稱 CREATE_MASK
可能還要修改,目前先暫訂如此進行簡單測試。
由於 memchr_inv()
無論是 32-bit 或是 64-bit 皆是採用 u64
的型態進行處理,因此在 memchr()
也打算用同樣的方式,才能使用共同的巨集。
memchr()
變成如下
#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
#define CREATE_MASK(val) \
val*= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
#define CREATE_MASK(val) \
val *= 0x01010101; \
val |= val << 32;
#else
#define CREATE_MASK(val) \
val |= val << 8; \
val |= val << 16; \
val |= val << 32;
#endif
void *memchr(const void *p, int c, unsigned long length)
{
unsigned long long mask, val;
const void *end = p + length;
c &= 0xff;
if (p <= end - 8) {
mask = c;
CREATE_MASK(mask);
for (; p <= end - 8; p += 8) {
val = *(unsigned long *)p ^ mask;
if ((val + 0xfefefefefefefeffu) & (~val & 0x8080808080808080u))
break;
}
}
for (; p < end; p++)
if (*(unsigned char *)p == c)
return (void *)p;
return NULL;
}
使用以下命令編譯成 x86-32 進行測試
gcc -m32 -O2 -o a.out test.c
成功通過之前使用的正確性測試
接著在 UML 測試修改過的 /lib/string.c
能否通過 UML 開機測試。
#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
#define CREATE_CHAR_MASK(val) val *= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
#define CREATE_CHAR_MASK(val) \
val *= 0x01010101; \
val |= val << 32;
#else
#define CREATE_CHAR_MASK(val) \
val |= val << 8; \
val |= val << 16; \
val |= val << 32;
#endif
#ifndef __HAVE_ARCH_MEMCHR
/**
* memchr - Find a character in an area of memory.
* @s: The memory area
* @c: The byte to search for
* @n: The size of the area.
*
* returns the address of the first occurrence of @c, or %NULL
* if @c is not found
*/
void *memchr(const void *p, int c, unsigned long length)
{
u64 mask, val;
const void *end = p + length;
c &= 0xff;
if (p <= end - 8) {
mask = c;
CREATE_CHAR_MASK(mask);
for (; p <= end - 8; p += 8) {
val = *(unsigned long *)p ^ mask;
if ((val + 0xfefefefefefefeffu) &
(~val & 0x8080808080808080u))
break;
}
}
for (; p < end; p++)
if (*(unsigned char *)p == c)
return (void *)p;
return NULL;
}
EXPORT_SYMBOL(memchr);
#endif
void *memchr_inv(const void *start, int c, size_t bytes)
{
u8 value = c;
u64 value64;
unsigned int words, prefix;
if (bytes <= 16)
return check_bytes8(start, value, bytes);
value64 = value;
CREATE_CHAR_MASK(value64);
prefix = (unsigned long)start % 8;
if (prefix) {
u8 *r;
prefix = 8 - prefix;
r = check_bytes8(start, value, prefix);
if (r)
return r;
start += prefix;
bytes -= prefix;
}
words = bytes / 8;
while (words) {
if (*(u64 *)start != value64)
return check_bytes8(start, value, 8);
start += 8;
words--;
}
return check_bytes8(start, value, bytes % 8);
}
EXPORT_SYMBOL(memchr_inv);
依照之前重新編譯 kernel 的方式將上方修改過的 memchr()
以及 memchr_inv()
,成功重新開機。
然而,重新編譯只編譯 64-bit 的 kernel , 32-bit kernel 並沒有測試到。由於 Ubuntu 已經沒有 32 bit 的映像檔,因此我選用 Linux Mint 32-bit 來對kernel 進行重新編譯。
首先去官網下載 32-bit 版本。使用 Virtual Box 安裝時選擇 Ubuntu 32-bit 即可。
從 lscpu
可看到
CPU op-mode(s): 32-bit
對比 64-bit Ubuntu
CPU op-mode(s): 32-bit, 64-bit
這應該是 32-bit 的 Linux 沒錯。
接著重複之前的重新編譯 kernel 的步驟,並註解掉 /arch/x86/include/asm/string_32.h
中的 memchr()
,以免使用這塊的 memchr()
而非 /lib/string.c
中的版本。
另外像之前的測試一樣加入 printk("str %u\n", length)
觀察是否真的執行到 memchr()
以及字串的長度。
重新編譯好後, Linux Mint 成功重新開機,輸入命令 dmesg
也觀察到 memch()
的輸出,和 64 bit 版本差不多。至此可以確定 32-bit Linux 下 memchr()
能正確運行。
memchr()
透過git blame 可發現 mmemchr()
是 Linus Torvalds 撰寫的,而 memchr_inv
是 Akinobu Mita 所撰寫。
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 869) #ifndef __HAVE_ARCH_MEMCHR
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 870) /**
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 871) * memchr - Find a character in an area of memory.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 872) * @s: The memory area
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 873) * @c: The byte to search for
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 874) * @n: The size of the area.
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 875) *
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 876) * returns the address of the first occurrence of @c, or %NULL
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 877) * if @c is not found
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 878) */
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 879) void *memchr(const void *s, int c, size_t n)
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 880) {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 881) const unsigned char *p = s;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 882) while (n-- != 0) {
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 883) if ((unsigned char)c == *p++) {
51a0f0f658b0e (Jesper Juhl 2005-10-30 15:02:11 -0800 884) return (void *)(p - 1);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 885) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 886) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 887) return NULL;
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 888) }
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 889) EXPORT_SYMBOL(memchr);
^1da177e4c3f4 (Linus Torvalds 2005-04-16 15:20:36 -0700 890) #endif
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 912) void *memchr_inv(const void *start, int c, size_t bytes)
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 913) {
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 914) u8 value = c;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 915) u64 value64;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 916) unsigned int words, prefix;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 917)
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 918) if (bytes <= 16)
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 919) return check_bytes8(start, value, bytes);
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 920)
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 921) value64 = value;
72d931046030b (Linus Torvalds 2014-09-13 11:14:53 -0700 922) #if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
3368e8fbcda53 (Andy Shevchenko 2015-11-10 14:45:14 -0800 923) value64 *= 0x0101010101010101ULL;
72d931046030b (Linus Torvalds 2014-09-13 11:14:53 -0700 924) #elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 925) value64 *= 0x01010101;
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 926) value64 |= value64 << 32;
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 927) #else
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 928) value64 |= value64 << 8;
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 929) value64 |= value64 << 16;
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 930) value64 |= value64 << 32;
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 931) #endif
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 932)
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 933) prefix = (unsigned long)start % 8;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 934) if (prefix) {
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 935) u8 *r;
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 936)
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 937) prefix = 8 - prefix;
f43804bf5f9ae (Akinobu Mita 2012-03-23 15:02:14 -0700 938) r = check_bytes8(start, value, prefix);
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 939) if (r)
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 940) return r;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 941) start += prefix;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 942) bytes -= prefix;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 943) }
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 944)
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 945) words = bytes / 8;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 946)
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 947) while (words) {
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 948) if (*(u64 *)start != value64)
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 949) return check_bytes8(start, value, 8);
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 950) start += 8;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 951) words--;
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 952) }
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 953)
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 954) return check_bytes8(start, value, bytes % 8);
798248206b59a (Akinobu Mita 2011-10-31 17:08:07 -0700 955) }
patch 1/2:
Subject: [PATCH 1/2] lib/string.c: Add a macro for memchr_inv()
We add a macro MEMCHR_MASK_GEN() so that both memchr_inv()
and memchr() can use it to generate a 8 bytes mask.
Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com>
Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
patch 2/2:
Subject: [PATCH 2/2] lib/string.c: Optimize memchr()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The original version of memchr() is implemented with the byte-wise
comparing technique, which does not fully use 64-bits or 32-bits
registers in CPU. We use word-wide comparing so that 8 characters
can be compared at the same time on CPU. This code is base on
David Laight's implementation.
We create two files to measure the performance. The first file
contains on average 10 characters ahead the target character.
The second file contains at least 1000 characters ahead the
target character. Our implementation of “memchr()” is slightly
better in the first test and nearly 4x faster than the orginal
implementation in the second test.
Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com>
Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
製作 patch 時,出現以下 CHECK
CHECK: Macro argument reuse 'mask' - possible side-effects?
#29: FILE: lib/string.c:888:
+#define MEMCHR_MASK_GEN(mask) \
+ do { \
+ mask *= 0x01010101; \
+ mask |= mask << 32; \
+ } while (0)
CHECK: Macro argument 'mask' may be better as '(mask)' to avoid precedence issues
#29: FILE: lib/string.c:888:
+#define MEMCHR_MASK_GEN(mask) \
+ do { \
+ mask *= 0x01010101; \
+ mask |= mask << 32; \
+ } while (0)
按照指示修該產生以下訊息
CHECK: Macro argument reuse 'mask' - possible side-effects?
#83: FILE: lib/string.c:888:
+#define MEMCHR_MASK_GEN(mask) \
+ do { \
+ (mask) *= 0x01010101; \
+ (mask) |= mask << 32; \
+ } while (0)
第一種情況為優先權問題,假設有一 macro 如下:
#define ceil_div(x, y) (x + y - 1) / y
用以下方法呼叫
a = ceil_div (b & c, sizeof (int));
展開如下
a = (b & c + sizeof (int) - 1) / sizeof (int);
然而,考慮到優先程度,實際上會用以下方式運行
a = (b & (c + sizeof (int) - 1)) / sizeof (int);
而非
a = ((b & c) + sizeof (int) - 1)) / sizeof (int);
第二種情況是擔心 mask
可能是 fuction 的回傳值,在 macro 中重複使用會造成多次呼叫 function 。
然而,MEMCHR_MASK_GEN
這個 macro 只接受變數輸入,是將 mask
中的值展開成 8 bytes ,不可能牽涉到優先權以及 function 呼叫,因此我認為不須理會以上的 CHECK 。
輸入下列命令後
./scripts/get_maintainer.pl 0001-lib-string.c-Add-a-macro-for-memchr_inv.patch
Andy Shevchenko <andy@kernel.org> (reviewer:GENERIC STRING LIBRARY)
linux-kernel@vger.kernel.org (open list)
因此寄給的對象可能是 memchr_inv()
的作者 Akinobu Mita akinobu.mita@gmail.com
以及 Andy Shevchenko andy@kernel.org
根據老師的提示,有些處理器架構,如:ARM ,並沒有對 unaligned data 進行處理時會有效率問題,因此要針對這一點做處理,也是開發者們在乎的問題。
#define LBLOCKSIZE (sizeof(long))
#if BITS_PER_LONG == 64
#define DETECT_CHAR(X) \
(((X)-0x0101010101010101ULL) & ~(X)&0x8080808080808080ULL)
#define MEMCHR_MASK_GEN(mask) \
do { \
(mask) |= (mask) << 8; \
(mask) |= (mask) << 16; \
(mask) |= (mask) << 32; \
} while (0)
#else
#define DETECT_CHAR(X) (((X)-0x01010101) & ~(X)&0x80808080)
#define MEMCHR_MASK_GEN(mask) \
do { \
(mask) |= (mask) << 8; \
(mask) |= (mask) << 16; \
} while (0)
#endif
void *memchr(const void *p, int c, size_t length)
{
unsigned long mask, val;
const void *end = p + length;
c &= 0xff;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
while ((long)p & (sizeof(long) - 1)) {
if (p >= end)
return NULL;
if (*(unsigned char *)p == c)
return (void *)p;
p++;
}
#endif
if (p <= end - LBLOCKSIZE) {
mask = c;
MEMCHR_MASK_GEN(mask);
for (; p <= end - LBLOCKSIZE; p += LBLOCKSIZE) {
val = *(unsigned long *)p ^ mask;
if (DETECT_CHAR(val))
break;
}
}
for (; p < end; p++)
if (*(unsigned char *)p == c)
return (void *)p;
return NULL;
}
EXPORT_SYMBOL(memchr);
#if BITS_PER_LONG == 64
#define MEMCHR_MASK_GEN(mask) \
do { \
(mask) |= (mask) << 8; \
(mask) |= (mask) << 16; \
(mask) |= (mask) << 32; \
} while (0)
#else
#define MEMCHR_MASK_GEN(mask) \
do { \
(mask) |= (mask) << 8; \
(mask) |= (mask) << 16; \
} while (0)
#endif
void *memchr(const void *p, int c, size_t length)
{
unsigned long mask;
unsigned char *src = (unsigned char *) p;
const void *end = p + length;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
size_t max = length;
c &= 0xff;
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
* If src is unaligned, don't cross a page boundary,
* since we don't know if the next page is mapped.
*/
if ((long)p & (sizeof(long) - 1)) {
size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));
if (limit < max)
max = limit;
}
#else
/* If src is unaligned, don't do word-at-a-time. */
if (((long) p) & (sizeof(long) - 1))
max = 0;
#endif
mask = c;
MEMCHR_MASK_GEN(mask);
for ( ; max >= sizeof(unsigned long); max -= sizeof(unsigned long), src += sizeof(unsigned long)) {
unsigned long data, result;
data = read_word_at_a_time(src);
data = data ^ mask;
if (has_zero(data, &result, &constants)) {
break;
}
}
for (; src < end; src++)
if (*src == c)
return (void *)src;
return NULL;
}