2022q1 Homework5 (quiz8)

contribute by < arthurchang09 >

第 8 週測驗題和延伸問題的討論

將組合語言引入 memchr_opt

下方組合語言可以順利完成 alignment 的部分，相當於 memchr_opt 第一個 while 迴圈，

void *func(const void *src_void, int c, size_t length){
    const unsigned char *src = (const unsigned char *) src_void;
    unsigned char d = c;
    long al;
    char test;
    asm volatile(
            "jmp align\n\t"
            "loop:\n\t"
            "length:\n\t"
            "decq %1\n\t"
            "testq %1,%1\n\t"
            "jne equal\n\t"
            "mov $0,%0\n\t"
            "jmp end\n\t"
            "equal:\n\t"
            "mov (%0),%3\n\t"
            "cmp %%al,%3\n\t"
            "je end\n\t"
            "incq %0\n\t"
            "align:\n\t"
            "mov %0,%2\n\t"
            "andq $7,%2\n\t"
            "testq %2,%2\n\t"
            "jne loop\n\t"
            : "=D" (src), "=&c" (length),"=r"(al),"=r"(test)
            : "a" (c), "0" (src), "1" (length), "2"(al),"3"(test)
            : "memory","cc");
    
    asm volatile(
            "end:\n\t"
            : "=D" (src), "=&c" (length),"=r"(al),"=r"(test)
            : "a" (c), "0" (src), "1" (length), "2"(al),"3"(test)
            : "memory","cc");
    return (void *) src;

目前已將 alignment 與 DETEC_CHAR 巨集的部分改寫成以下 inline assembly 。

void *fun(const void *src_void, int c, size_t length){
	const unsigned char *src = (const unsigned char *) src_void;
    unsigned char d = c;
    long al;
    char test;
    asm volatile(
        "jmp align\n\t"
        "loop:\n\t"
        "length:\n\t"
        "decq %1\n\t"
        "testq %1,%1\n\t"
        "jne equal\n\t"
        "mov $0,%0\n\t"
        "jmp end\n\t"
        "equal:"
        "mov (%0),%3\n\t"
        "cmp %%al,%3\n\t"
        "je end\n\t"
        "incq %0\n\t"
        "align:\n\t"
        "mov %0,%2\n\t"
        "testq $7,%2\n\t"
        "jne loop\n\t"
        : "=D" (src), "=&c" (length),"=r"(al),"=r"(test)
        : "a" (c), "0" (src), "1" (length), "2"(al),"3"(test)
        : "memory","cc");
    if (!TOO_SMALL(length)) {
        unsigned long *asrc = (unsigned long *) src;
        unsigned long mask = d << 8 | d;
        mask = mask << 16 | mask;
        for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
            mask = (mask << i) | mask;
        unsigned long xor, consta = 0xFEFEFEFEFEFEFEFF,constb = 0x8080808080808080;
        unsigned long mask4 = mask;
        asm volatile(
            "detect_loop:\n\t"
            "movq %6,%1\n\t"
            "xorq (%0),%1\n\t"
            "movq %1,%2\n\t"
            "addq %4,%1\n\t"
            "notq %2\n\t"
            "andq %5,%2\n\t"
            "andq %1,%2\n\t"
            "testq %2,%2\n\t"
            "jne break_detect\n\t"
            "sub $8,%3\n\t"
            "add $8,%0\n\t"
            "cmp $7,%3\n\t"
            "jae detect_loop\n\t"
            :"=D"(asrc),"=r"(mask),"=r"(xor),"=r"(length)
            :"r"(consta),"r"(constb),"r"(mask4),"0"(asrc),"1"(mask),"2"(xor),"3"(length)
            :"memory");
        asm volatile(
            "break_detect:\n\t"
            :"=D"(asrc),"=r"(mask),"=r"(xor),"=r"(length)
            :"r"(consta),"r"(constb),"r"(mask4),"0"(asrc),"1"(mask),"2"(xor),"3"(length)
            :"memory");
        src = (unsigned char *) asrc;
    }
    asm volatile("repne\n\t"
		"scasb\n\t"
		"je 1f\n\t"
		"mov $1,%0\n"
		"1:\tdec %0"
		: "=D" (src), "=&c" (length)
		: "a" (c), "0" (src), "1" (length)
		: "memory");
    
    asm volatile(
        "end:\n\t"
        : "=D" (src), "=&c" (length),"=r"(al),"=r"(test)
		: "a" (c), "0" (src), "1" (length), "2"(al),"3"(test)
		: "memory","cc");
    
    return (void *) src;
}

效能與正確性測試程式如下

int main(){
    char str2[1000];
    memset(str2,'0', sizeof(char)*1000);
    str[999] = '\0';
    char ch3 = '4';
    char *ret, *ret1, *ret3;
    for (int i = 0; i < 999; ++i) {
        memset(str2,'0', sizeof(char)*1000);
        str2[i] = '4';
        struct timespec t1,t2,t3,t4,t5,t6;

        ret = memchr_opt(str2,ch3,strlen(str2));

        clock_gettime(CLOCK_REALTIME, &t1);
        for (int j = 0; j < 100; ++j){
            ret = memchr_opt(str2,ch3,strlen(str2));
        }
        clock_gettime(CLOCK_REALTIME, &t2);

        clock_gettime(CLOCK_REALTIME, &t3);
        for (int j = 0; j < 100; ++j){
            ret1 = fun(str2,ch3,strlen(str2));
        }
        clock_gettime(CLOCK_REALTIME, &t4);


        clock_gettime(CLOCK_REALTIME, &t5);
        for (int j = 0; j < 100; ++j){
            ret3 = memchr(str2,ch3,strlen(str2));
        }
        clock_gettime(CLOCK_REALTIME, &t6);

        printf("%d %ld %ld %ld\n", i, (t2.tv_nsec - t1.tv_nsec) / 100, (t4.tv_nsec - t3.tv_nsec) / 100, (t6.tv_nsec - t5.tv_nsec) / 100);

        if (strlen(ret) != strlen(ret1)){
            printf("error\n");
        } else {
            for (int k = 0; k < strlen(ret); ++k){
                if (ret[k] != ret1[k]){
                    printf("error2");
                    break;
                }
            }
        }
    }
    return 0;
}

編譯

gcc -o a.out test.c

初步在虛擬機 (CPU i7-10700K) userspace 之效能如下圖

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

雖然效能不及 default-memchr ，但已比 memchr-opt 運行速度更快。

在 Linux Kernel Module 測試

預計採用 fibdrv 方式寫成 Linux character device driver，使用 copy_from_user 將字串從 userspace 複製到 kernel 放入 kmalloc 要到的記憶體空間，最後將結果使用 copy_to_user 回傳到 userspace ，並將執行時間用 return 的方式回傳。

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

inline assembly 效能卻較差，試著在優化

將 alignment 的部分稍作修改，省一次 jmp 。

asm volatile(
       "testq $7,%0\n\t"
       "je end_align\n\t"
       "loop:\n\t"
       "length:\n\t"
       "decq %1\n\t"
       "testq %1,%1\n\t"
       "jne equal\n\t"
       "mov $0,%0\n\t"
       "jmp end\n\t"
       "equal:"
       "mov (%0),%2\n\t"
       "cmp %%al,%2\n\t"
       "je end\n\t"
       "incq %0\n\t"
       "align:\n\t"
       "testq $7,%0\n\t"
       "jne loop\n\t"
       "end_align:\n\t"
       : "=D" (src), "=&c" (length),"=r"(test)
       : "a" (c), "0" (src), "1" (length),"2"(test)
       : "memory","cc");

考慮到之後會在 LKML 提交圖表，標示方式應該統一:

glibc-memchr-x86: glibc 裡頭 sysdep/x86 的 memchr 實作
linux-memchr-x86: 原本 Linux 核心的 x86 memchr 實作
memchr-x86-opt: 上述針對 Linux 核心 x86 memchr 的效能改進實作

回顧 inline assmebly 程式碼，用 gcc -S test.c 觀察組合語言，發現每一段 inline assmebly 前都會將原本 register 的植存回記憶體，我的實作又有數個分段的 inline assembly ，因此最佳的方式只要一塊 inline assembly 。

.L23:
	cmpl	$63, -60(%rbp)
	jbe	.L24
	movabsq	$-72340172838076673, %rax
	movq	%rax, -32(%rbp)
	movabsq	$-9187201950435737472, %rax
	movq	%rax, -24(%rbp)
	movq	-32(%rbp), %r8
	movq	-24(%rbp), %r9
	movq	-48(%rbp), %r10
	movq	-40(%rbp), %rsi
	movq	-16(%rbp), %rcx
	movq	-8(%rbp), %rdx
	movq	-88(%rbp), %rax
	movq	%rsi, %rdi
#APP
# 180 "test.c" 1
	detect_loop:
	movq (%rdi),%rcx
	xorq %r10,%rcx
	lea (%rcx,%r8), %rdx
	notq %rcx
	andq %r9,%rcx
	testq %rcx,%rdx
	jne break_detect
	sub $8,%rax
	add $8,%rdi
	cmp $7,%rax
	jae detect_loop

經過數次對比測試，發現去掉下列部分組合語言:

 asm volatile("repne\n\t"
		"scasb\n\t"
		"je 1f\n\t"
		"mov $1,%0\n"
		"1:\tdec %0"
		: "=D" (src), "=&c" (length)
		: "a" (c), "0" (src), "1" (length)
		: "memory");

去掉開頭 alignment 的組合語言在不開啟編譯器優化下效能沒差。但先去掉以節省將 register 存回 memory 的操作。

在虛擬機中 Linux 核心之效能表現不錯。和 glibc-memchr-x86 實作比較上，效能也有些許勝出，如下圖所示：

以上測試皆在虛擬機上，轉移到實體電腦測試(CPU i5-7200U)。

參照 std::find() and memchr() Optimizations 一文，使用其效能分析的手法，對上述 memchr-x86-opt 和相關實作進行評比。
jserv

參考 std::find() and memchr() Optimizations 的效能分析

接著參考 std::find() and memchr() Optimizations ，其採用 128 KiB 的 char buffer 儲存字串。會選擇這個大小是因為會有更好的 throughput ，參見 GNU 註解，我先在虛擬機是跑裡面的 script

    for i in $(seq 0 10); do
      bs=$((1024*2**$i))
      printf "%7s=" $bs
      timeout --foreground -sINT 2 \
        dd bs=$bs if=/dev/zero of=/dev/null 2>&1 \
          | sed -n 's/.* \([0-9.]* [GM]B\/s\)/\1/p'
    done

得到以下輸出：

bash throughput.sh 
   1024=3.6 GB/s
   2048=6.5 GB/s
   4096=11.0 GB/s
   8192=16.2 GB/s
  16384=21.4 GB/s
  32768=22.1 GB/s
  65536=26.1 GB/s
 131072=28.0 GB/s
 262144=27.3 GB/s
 524288=27.1 GB/s
1048576=28.3 GB/s

筆電雙系統

   1024=916 MB/s
   2048=1.8 GB/s
   4096=3.3 GB/s
   8192=5.7 GB/s
  16384=9.0 GB/s
  32768=11.5 GB/s
  65536=15.0 GB/s
 131072=17.0 GB/s
 262144=17.4 GB/s
 524288=17.8 GB/s
1048576=18.0 GB/s

每次執行執會稍有不同，但可看出大約在

131072 = 128 \times 1024 K i B

速度就達到接近頂峰，因此選擇這個大小。

此文作者程式碼主要是測試較為破碎的字串，輸入資料擷取部分如下：








W,w,WW,WWW,WY|Y,y,A
a,AA,AAA,aah,ah|AI,air,AR
Ar,Au,aw,E,e|ea,ear,EEO
e'er,eh,EOE,ER,Er|er,EU,Eu
Eur,I,i,IA,Ia|IE,IEEE,ii
iii,Io,IOU,Ir,O|o,oar,OE
o'er,OH,oh,oi,ooh|OR,or,our
ow,U,u,UAR,UAW|ugh,uh,Ur

測試程式會用一次 find() ，和 memchr 功能相近的程式，去找每一行的 \n 的位址，再用一次 find() 找到 | 。以上方的資料為例，第一次尋找會找到 W,w,WW,WWW,WY|Y,y,A ，第二次會找到 |Y,y,A 。接著移向下一行再尋找，直到 buffer ，即暫存所有資料的 array 走到盡頭，接著會再繼續從檔案讀取下一波 131072 byte 資料。因此，估計在不開優化的情形下 linux-memchr-x86 的表現不會太差。

以下為改寫成 C 之測試程式碼








































int main()
{
    char buffer[128*1024 + 64];
    int N = 128 * 1024;
    int n = 3000;
    FILE* fd = fopen("in", "r");
    size_t r, off = 0;
    char *e, *x = buffer, *y;
    const char *b1 = buffer + N;
    for (int i = 0; i < n; ++i) {
        fseek(fd, 0, SEEK_SET);
        while (1)
        {
            off = 0;
            r = fread(buffer, sizeof(char), N, fd);
            char *b = buffer;
            for (;;) {
                x = memchr_x86(b, '\n', b1 - x);
                if (!x) {
                    break;
                }
                y = memchr_x86(b, '|', x - b);
                *x = '\0';
                off += y - b;
                if (y != x) {
                    printf("%ld\n", off);
                }
                if (x >= b1) {
                    break;
                }
                b = x + 1;
                off = 0;
            }
            if (r != N) {
                break;
            }
        }
        
    }
}

採用 command line 的 time 測量整個程式的執行時間，不開啟編譯器優化

time taskset 0x01 ./b.out

	memchr-x86-opt	linux-memchr-x86	glibc-memchr-x86
real time	11.33825	11.72425	13.076
user time	11.0325	11.406	12.726
sys time	0.2935	0.32425	0.344

由上方表格可看出在短字串下 linux-memchr-x86 表現比 glibc-memchr-x86 好

接下來測試開啟編譯器優化，由於先前 inline assembly 的 label 是採用文字，造成開啟 -O3 優化時會出現錯誤，因此在此修改 label ，改用數字表示，且在 jump instruction 後的 label 要加上 f 或 b 表示會往下方或上方的 label 進行 jump ，使編譯器 unroll 時不會發生錯誤，如下：

void *fun1(const void *src_void, int c, size_t length){
    const unsigned char *src = (const unsigned char *) src_void;
    unsigned char d = c;
    long al;
    char test;
    while (UNALIGNED(src)) {
        if (!length--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }
    if (!TOO_SMALL(length)) {
        unsigned long *asrc = (unsigned long *) src;
        unsigned long mask = d << 8 | d;
        mask = mask << 16 | mask;
        for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
            mask = (mask << i) | mask;
        unsigned long xor, consta = 0xFEFEFEFEFEFEFEFF,constb = 0x8080808080808080;
        unsigned long data;
        asm volatile(
            "1:\n\t"
            "movq (%0),%1\n\t"
            "xorq %6,%1\n\t"
            "lea (%1,%4), %2\n\t"
            "notq %1\n\t"
            "andq %5,%1\n\t"
            "testq %1,%2\n\t"
            "jne 2f\n\t"
            "sub $8,%3\n\t"
            "add $8,%0\n\t"
            "cmp $7,%3\n\t"
            "jae 1b\n\t"
            "2:\n\t"
            :"=D"(asrc),"=r"(data),"=r"(xor),"=r"(length)
            :"r"(consta),"r"(constb),"r"(mask),"0"(asrc),"1"(data),"2"(xor),"3"(length)
            :"memory");
        src = (unsigned char *) asrc;
    }
    while (length--) {
        if (*src == d){
            return (void *) src;
        }
        src++;
    }
    return (void *) NULL;
}

	memchr-x86-opt	linux-memchr-x86	glibc-memchr-x86
real time	8.34025	11.1175	8.4965
user time	8.01525	10.82	8.23925
sys time	0.312	0.291	0.257

開啟 -O3 後，三者的速度都有所加快。其中 glibc-memchr-x86 之速度加快很多，但仍較 memchr-x86-opt 慢。

接著回去看組合語言，兩者皆是將函式直接放入 int main() 中，因此 funtion call 的成本兩者應該是相近的，其中並未使用 loop unrolling 的方式優化。因此推測是 inline assmebly 的部分帶來的加速效果。因此我使用原本測驗一的程式碼再跑一次測試，並使用 perf 比較是否是組合語言達成這個效果，如下：

memchr-x86-opt(assembly)

sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null

 Performance counter stats for 'taskset 0x01 ./b.out' (10 runs):

     205,4433,7709      cycles                                                        ( +-  0.02% )  (66.63%)
     447,8938,8415      instructions              #    2.18  insn per cycle           ( +-  0.00% )  (83.32%)
          282,0384      cache-misses              #    1.288 % of all cache refs      ( +-  7.92% )  (83.33%)
       2,1897,1894      cache-references                                              ( +-  1.35% )  (83.35%)
       2,6449,1465      branch-misses                                                 ( +-  0.01% )  (83.36%)
      96,5328,5976      branch                                                        ( +-  0.00% )  (83.33%)

           8.25079 +- 0.00243 seconds time elapsed  ( +-  0.03% )

memchr-x86-opt(original)

sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null

 Performance counter stats for 'taskset 0x01 ./b.out' (10 runs):

     218,3588,7203      cycles                                                        ( +-  0.05% )  (66.66%)
     448,9144,0365      instructions              #    2.06  insn per cycle           ( +-  0.00% )  (83.34%)
          342,8490      cache-misses              #    1.737 % of all cache refs      ( +-  8.35% )  (83.34%)
       1,9738,5499      cache-references                                              ( +-  0.89% )  (83.34%)
       2,5664,0313      branch-misses                                                 ( +-  0.02% )  (83.34%)
      95,5833,8661      branch                                                        ( +-  0.00% )  (83.32%)

           8.77154 +- 0.00486 seconds time elapsed  ( +-  0.06% )

glibc-memchr-x86

sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null

 Performance counter stats for 'taskset 0x01 ./b.out' (10 runs):

     211,8621,2328      cycles                                                        ( +-  0.03% )  (66.64%)
     444,6003,3863      instructions              #    2.10  insn per cycle           ( +-  0.00% )  (83.33%)
          335,5993      cache-misses              #    1.595 % of all cache refs      ( +-  5.27% )  (83.34%)
       2,1047,1090      cache-references                                              ( +-  1.85% )  (83.34%)
       2,5254,3590      branch-misses                                                 ( +-  0.00% )  (83.34%)
      97,1084,9600      branch                                                        ( +-  0.00% )  (83.34%)

           8.50679 +- 0.00295 seconds time elapsed  ( +-  0.03% )

linux-memchr-x86

sudo perf stat -r 10 -e cycles,instructions,cache-misses,cache-references,branch-misses,branch taskset 0x01 ./b.out > /dev/null

 Performance counter stats for 'taskset 0x01 ./b.out' (10 runs):

     280,0662,7457      cycles                                                        ( +-  0.09% )  (66.65%)
     309,2942,6350      instructions              #    1.10  insn per cycle           ( +-  0.01% )  (83.33%)
         1548,9334      cache-misses              #    7.536 % of all cache refs      ( +-  3.03% )  (83.33%)
       2,0553,4812      cache-references                                              ( +-  1.52% )  (83.34%)
           72,7369      branch-misses                                                 ( +-  0.67% )  (83.34%)
      61,5698,8166      branch                                                        ( +-  0.01% )  (83.34%)

           11.2574 +- 0.0114 seconds time elapsed  ( +-  0.10% )

整理如下：

	memchr-x86-opt(assembly)	memchr-x86-opt(original)	glibc-memchr-x86	linux-memchr-x86
time(s)	8.25079 ± 0.00243	8.77154 ± 0.00486	8.50679 ± 0.00295	11.2574 ± 0.0114
instructions	447,8938,8415	448,9144,0365	444,6003,3863	309,2942,6350
cache-misses	282,0384	342,8490	335,5993	1548,9334
insn per cycle	2.18	2.06	2.10	1.10

相較於測驗一原始程式，可以看出我修改的 inline assembly 在 IPC(instruction per cycle) 以及 cache-misses 有所領先。

以上的測試主要是針對較短字串，若目標字元在整個字串相對靠後的位置，表現應該會有所變化。接下來我對輸入資料做修改，在 | 之前至少有 200 個單字，後面有 1 ~ 50 個單字，每個單字平均長度以 5 個字元計算，包括逗號，至少需比較 1199 個字元。

資料產生的方式為以下 word_gen.py ，從 /usr/share/dict/words 中隨機選取單詞組合

#!/usr/bin/env python3
import random
import string

words = open('/usr/share/dict/words').read().splitlines()
ranstr = ''
f = open('in2.txt', 'w')
for i in range(15833):
    ranstr1 = ''
    lengh = random.randint(200,400)
    for j in range(lengh):
        ranstr += words[random.randint(0,len(words) - 1)]
        if (j < lengh - 1):
            ranstr += ','
    ranstr += '|'
    lengh = random.randint(1,50)
    for j in range(lengh):
        ranstr += words[random.randint(0,len(words) - 1)]
        if (j < lengh - 1):
            ranstr += ','
    print(ranstr, file = f)
    ranstr = ''
f.close()

範例字串

dawns,gratitude's,Houdini,barnyard,OHSA's,Florence's,rehabilitates,foulest,conquistadores,divide's,insert's,ramifications

開啟 -O3 優化結果

	memchr-x86-opt	linux-memchr-x86	glibc-memchr-x86
real time	1m 5.57875	4m 20.38075	1m 16.8215
user time	45.76925	3m 57.2797	55.55025
sys time	19.73975	22.98525	21.2315

未開啟優化 :

	memchr-x86-opt	linux-memchr-x86	glibc-memchr-x86
real time	1m 30.063	4m 20.9525	2m 28.592
user time	1m 8.37275	3m 57.7035	2m 6.033
sys time	21.73575	23.12075	22.484

由以上表格可看出在不開優化下 linux-memchr-x86 在長字串的搜尋就沒有比 glibc-memchr-x86 快。

由於 time 這個命令可能會將 ELF loader 和 VMA 的成本都算進去，因此改用 struct timespec 來測量時間這 3000 次 iteration 總共花的時間。

結果如下：

開啟 -O3

	memchr-x86-opt	linux-memchr-x86	glibc-memchr-x86
時間(s)	65.403	259.202	65.284

未開啟優化

	memchr-x86-opt	linux-memchr-x86	glibc-memchr-x86
時間(s)	89.296	259.409	147.488

意外的是開啟 -O3 時 glibc-memchr-x86 這次的測試成績居然較 memchr-x86-opt 佳，大約可以快約 0.1 ~ 0.8 秒，因此我用同樣的方式重新測量了一次時間，結果 glibc-memchr-x86 時間又回到之前採用 time 命令的長度，反而 memchr-x86-opt 的執行時間沒有太大改變，維持 65 秒上下，不太清楚為何有這 10 秒的差距。

	memchr-x86-opt	linux-memchr-x86	glibc-memchr-x86
時間(s)	65.398	259.214	76.24

接著我在程式碼不同的位置放上 clock_gettime 測量不同區段的執行時間。首先，放在上方程式碼 17 行 for 迴圈前後，將每次跑迴圈的時間累加起來。接著是在 while 迴圈前後，將每次檔案處理完的時間加起來，比較這其中的差距。

逐次 for 迴圈時間總和

	memchr-x86-opt	glibc-memchr-x86
時間(s)	45.5391	44.93

逐次 while 迴圈時間總和

	memchr-x86-opt	glibc-memchr-x86
時間(s)	69.74045	76.209

明顯觀察到 for 迴圈的執行時間 glibc-memchr-x86 較快，而 while 迴圈中則變慢，且 while 迴圈的測量數據中， memchr-x86-opt 竟然較測量 3000 次 iteration 長，明顯測量上有問題。

接著我測量跑完單一檔案所花的時間，也就是 while 迴圈前後的位置放上 clock_gettime ，但只執行一次就跳出最外層的 for 迴圈，結果如下:

	memchr-x86-opt	glibc-memchr-x86
時間(s)	0.02173	0.02484

上方表格的數字乘 3000 就和使用 time 以及測量 3000 iteration 的時間相近，

重新設定 isolcpus 在進行效能測試

跟老師討論後，認為時間的差距有可能是因為 CPU0 存在一些特定的裝置驅動或 kthread，造成測量的時間有誤，因此重新限定 CPU 給特定的程式使用。我的 CPU 為 2 核 4 緒，因此將 CPU2 和 CPU 3 獨立出來。
進入 /etc/default/grub

sudo vim /etc/default/grub

看到以下這行

GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"

改成

GRUB_CMDLINE_LINUX_DEFAULT="quiet splash isolcpus=2,3"

關閉 Hyper Thread

在筆電的 BIOS 翻了一圈並沒有相關選項，因此無法關閉。推測可能是因為筆電的 BIOS 限制較嚴格，加上我的筆電 CPU 其實只有兩個實體核，故無法關閉。

結果

為了去除可能的誤差，我按照老師的提示，運用統計手法去除極端值，並開啟 -O2 優化。

首先，測量一次檔案的時間，總共取 1000 次，並且取 95% 信賴區間。

	memchr-x86-opt	linux-memchr-x86	glibc-memchr-x86
時間範圍(s)	0.02190 ~ 0.02201	0.08536 ~ 0.08713	0.02256 ~0.02275

可看出在 -O2 優化下， memchr-x86-opt 和 glibc-memchr-x86 效能是相差無幾，兩者的核心概念是相同，只是實作方式稍有不同。我所新增的組合語言也是基於 -O3 的組合語言改寫，因此效能差不多是在意料之中。

相較於 linux-memchr-x86 ， memchr-x86-opt 的執行時間是前者的

\frac{1}{4}

，可見 SWAR 、 word-wide comparing 是十分有效的方法。

以 UML/x86-64 觀察 memchr 的機械碼

UML 之核心版本為 5.15，memchr 反組譯的結果如下:

$ objdump -xd linux | grep -A12 "<memchr>"
000000006020a91e <memchr>:
    6020a91e:	f3 0f 1e fa          	endbr64 
    6020a922:	48 89 f8             	mov    %rdi,%rax
    6020a925:	48 01 fa             	add    %rdi,%rdx
    6020a928:	48 39 d0             	cmp    %rdx,%rax
    6020a92b:	74 0e                	je     6020a93b <memchr+0x1d>
    6020a92d:	48 8d 48 01          	lea    0x1(%rax),%rcx
    6020a931:	40 38 30             	cmp    %sil,(%rax)
    6020a934:	74 07                	je     6020a93d <memchr+0x1f>
    6020a936:	48 89 c8             	mov    %rcx,%rax
    6020a939:	eb ed                	jmp    6020a928 <memchr+0xa>
    6020a93b:	31 c0                	xor    %eax,%eax
    6020a93d:	c3                   	retq

簡單觀察組合語言， lea 將資料讀入 register ， cmp 將讀入的資料和字元做比較，其中 %sil 是一個 8-bits register ，若相同，則 Zero Flag 設起，會跳到 req 回傳值。因此這部分的組合語言為 byte-wise comparing 。

memchr-opt-x86 之組合語言微調

以下組合語言輸出命令如下
gcc -S asm.c
gcc -S -O2 asm.c

並使用 diff 命令比較組合語言差異

移除 asrc 然後檢查組合輸出
不開啟優化時，組合語言是有些微區別。比較後發現主要的差別是進行 function call 時，從記憶體取值的位址有差異。最大不同處位於 if (!TOO_SMALL(length)) 中的部分，多了

    movq	-56(%rbp), %rax
    movq	%rax, -40(%rbp)

也就是 unsigned long *asrc = (unsigned long *) src; 這段程式碼的組合語言。

但開啟 -O2 優化後，這部分被優化掉，有沒有上述這段程式碼不影響組合語言。

試著 consta 和 consta 宣告時加 const
不開啟優化時，兩者組合語言沒有差別，開啟 -O2 優化時也沒有差異。

對比 memchr 之前有哪些開發者變更

我直接 git clone Linux 在 Github 上的專案。

git blame arch/x86/lib/string_32.c

b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 180) #ifdef __HAVE_ARCH_MEMCHR
8cf36d2bc5832 arch/x86/lib/string_32.c (Paolo Ciarrocchi   2008-02-19 23:09:59 +0100 181) void *memchr(const void *cs, int c, size_t count)
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 182) {
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 183)       int d0;
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 184)       void *res;
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 185)       if (!count)
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 186)               return NULL;
8cf36d2bc5832 arch/x86/lib/string_32.c (Paolo Ciarrocchi   2008-02-19 23:09:59 +0100 187)       asm volatile("repne\n\t"
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 188)               "scasb\n\t"
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 189)               "je 1f\n\t"
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 190)               "movl $1,%0\n"
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 191)               "1:\tdecl %0"
3492cdf0176bd arch/x86/lib/string_32.c (Paolo Ciarrocchi   2008-08-02 21:25:13 +0200 192)               : "=D" (res), "=&c" (d0)
3492cdf0176bd arch/x86/lib/string_32.c (Paolo Ciarrocchi   2008-08-02 21:25:13 +0200 193)               : "a" (c), "0" (cs), "1" (count)
3492cdf0176bd arch/x86/lib/string_32.c (Paolo Ciarrocchi   2008-08-02 21:25:13 +0200 194)               : "memory");
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 195)       return res;
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 196) }
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 197) EXPORT_SYMBOL(memchr);
b520b85a963bf arch/i386/lib/string.c   (Andi Kleen         2007-07-21 17:09:59 +0200 198) #endif

memchr 基本上由 Andi Kleen 和 Paolo Ciarrocchi 變更 memchr 這段程式碼，主要內容還是由 Andi Kleen 撰寫。

memchr 的改進描述，可參見 arm64: Better optimised memchr()

Although we implement our own assembly version of memchr(), it turns out to be barely any better than what GCC can generate for the generic C version (and would go wrong if the size_t argument were ever large enough to be interpreted as negative). Unfortunately we can't import the tuned implementation from the Arm optimized-routines library, since that has some Advanced SIMD parts which are not really viable for general kernel library code. What we can do, however, is pep things up with some relatively straightforward word-at-a-time logic for larger calls.

Adding some timing to optimized-routines' memchr() test for a simple benchmark, overall this version comes in around half as fast as the SIMD code, but still nearly 4x faster than our existing implementation.

commit 9e51cafd783b22018fb15bfb06d65f69349223a9

撰寫 git commit message (草稿)

思考中…

The original assembly version of memchr() is implemented with the byte-wise comparing technic, which is not fully using whole 64-bits registers in x86-64 CPU. We use word-wide comparing so that 8 characters can be compared at the same time on x86-64 CPU. First we align the input and then use word-wise comparing to find the first 64-bit word that content the target. Secondly, we compare every byte in the word and get the output.

We create two files to measure the performance. The first file contains on average 10 characters ahead the target character. The second file contains at least 1000 characters ahead the target character. It turns out that "memchr()" is slightly better in the first test and nearly 4x faster than the orginal implementation in the second test.

string.h 的結構以及 `memchr-opt-x86` 可能的添加方式

以下節錄 /include/linux/string.h 以及 /lib/string.c

#ifndef __HAVE_ARCH_MEMCHR
extern void * memchr(const void *,int,__kernel_size_t);
#endif

#ifndef __HAVE_ARCH_MEMCHR
/**
 * memchr - Find a character in an area of memory.
 * @s: The memory area
 * @c: The byte to search for
 * @n: The size of the area.
 *
 * returns the address of the first occurrence of @c, or %NULL
 * if @c is not found
 */
void *memchr(const void *s, int c, size_t n)
{
	const unsigned char *p = s;
	while (n-- != 0) {
        	if ((unsigned char)c == *p++) {
			return (void *)(p - 1);
		}
	}
	return NULL;
}
EXPORT_SYMBOL(memchr);
#endif

可看到若無 def __HAVE_ARCH_MEMCHR ，則會使用 string.h 中的 memchr。
追溯 __HAVE_ARCH_MEMCHR ，發現在 arch 目錄下各個架構 CPU ，如 arm 、 arm64 、 powerpc 、 x86 等的 string.h 都有，其中 x86 的部分只有在 string_32.h 、 string_32.c 中有 define。

接著觀察 x86 這兩個檔案，擷取如下：

#define __HAVE_ARCH_MEMCHR
extern void *memchr(const void *cs, int c, size_t count);

#ifdef __HAVE_ARCH_MEMCHR
void *memchr(const void *cs, int c, size_t count)
{
	int d0;
	void *res;
	if (!count)
		return NULL;
	asm volatile("repne\n\t"
		"scasb\n\t"
		"je 1f\n\t"
		"movl $1,%0\n"
		"1:\tdecl %0"
		: "=D" (res), "=&c" (d0)
		: "a" (c), "0" (cs), "1" (count)
		: "memory");
	return res;
}
EXPORT_SYMBOL(memchr);
#endif

string_32.h 中 define __HAVE_ARCH_MEMCHR ，並在 string_32.c 中判斷是否已 define ，並且實作程式碼。

接著再看 string_64.h ，其內容也類似上述描述，都是 #define __HAVE_ARCH_XXXX 接著 function 的宣告，只是沒有統一的 string_64.c ，而是各個 .s file 的組合語言檔案。因此，memchr-opt-x86 要加入且暫時不管 x86-32 的話，應該是在 string_64.h 做出類似前述的 #defin ，並且在 /arch/x86/lib 建立一個對應的 .c 檔。

之後會嘗試在 UML 上修改並測試。

嘗試在 string_64.h 增加 memchr

增加下面這兩行

#define __HAVE_ARCH_MEMCHR
extern void *memchr(const void *cs, int c, size_t count);

並將我的 memchr-opt-x86 新增成 string_64.c，如下:

#define __NO_FORTIFY
#include <linux/string.h>
#include <linux/export.h>

/* Nonzero if either X or Y is not aligned on a "long" boundary */
#define UNALIGNED(X) ((long) X & (sizeof(long) - 1))

/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))

/* Threshhold for punting to the bytewise iterator */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)

#ifdef __HAVE_ARCH_MEMCHR

void *memchr(const void *src_void, int c, size_t length){
    const unsigned char *src = (const unsigned char *) src_void;
    unsigned char d = c;
    while (UNALIGNED(src)) {
        if (!length--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }
    if (!TOO_SMALL(length)) {
        //unsigned long *asrc = (unsigned long *) src;
        unsigned long mask = d << 8 | d;
        unsigned int i = 32;
        mask = mask << 16 | mask;
        for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
            mask = (mask << i) | mask;
        unsigned long xor, consta = 0xFEFEFEFEFEFEFEFF,constb = 0x8080808080808080;
        unsigned long data;
        asm volatile(
            "1:\n\t"
            "movq (%0),%1\n\t"
            "xorq %6,%1\n\t"
            "lea (%1,%4), %2\n\t"
            "notq %1\n\t"
            "andq %5,%1\n\t"
            "testq %1,%2\n\t"
            "jne 2f\n\t"
            "sub $8,%3\n\t"
            "add $8,%0\n\t"
            "cmp $7,%3\n\t"
            "jae 1b\n\t"
            "2:\n\t"
            :"=D"(src),"=r"(data),"=r"(xor),"=r"(length)
            :"r"(consta),"r"(constb),"r"(mask),"0"(src),"1"(data),"2"(xor),"3"(length)
            :"memory");
        //src = (unsigned char *) asrc;
    }
    while (length--) {
        if (*src == d){
            return (void *) src;
        }
        src++;
    }
    return (void *) NULL;
}

EXPORT_SYMBOL(memchr);
#endif

compile 成 UML 並使用命令 objdump -xd linux | grep -A12 "<memchr>" 卻沒有找到 memchr 之組合語言，使用命令 objdump -xd linux | grep memchr ，部分輸出如下:

0000000060236a09 l     F .text	00000000000000bc              memchr_inv
000000006001ca80       F *UND*	0000000000000000              memchr@@GLIBC_2.2.5
000000006001ca80 <memchr@plt>:
    6001ca80:	ff 25 72 37 55 00    	jmpq   *0x553772(%rip)        # 605701f8 <memchr@GLIBC_2.2.5>

顯示成功移除掉原本的實作，但我的程式並沒有成功放入。推測是沒有進行編譯並放入核心，因此要需要對 Makefile 做修改。
arch/x86/lib

...

ifeq ($(CONFIG_X86_32),y)
        obj-y += atomic64_32.o
        lib-y += atomic64_cx8_32.o
        lib-y += checksum_32.o
        lib-y += strstr_32.o
        lib-y += string_32.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
        lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
        lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
        obj-y += iomap_copy_64.o
        lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
        lib-y += clear_page_64.o copy_page_64.o
        lib-y += memmove_64.o memset_64.o
        lib-y += copy_user_64.o
+       lib-y += string_64.o
        lib-y += cmpxchg16b_emu.o
endif

增加此行後，按照前述反組譯仍沒有出現，我繼續尋找解法。我追蹤 string_32.o ，發現在 arch/x86/um/Makefile 中有一段如下
















ifeq ($(CONFIG_X86_32),y)

obj-y += checksum_32.o syscalls_32.o
obj-$(CONFIG_ELF_CORE) += elfcore.o

subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o
subarch-y += ../kernel/sys_ia32.o

else

obj-y += syscalls_64.o vdso/

subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o

endif

在第 23 行有 ../lib/string_32.o ，其所在的 ifeq 是判斷 CPU 是否為 32 bits ，則下面的 else 必為 64 bits ，因此我在第 31 行最後面增加 ../lib/string_64.o ，再編譯一次 UML 。
首先輸入命令 objdump -xd linux | grep memchr，擷取部分輸出如下:

0000000060236a63 l     F .text	00000000000000bc              memchr_inv
0000000060036cde l     F .text	00000000000000aa              memchr

接著 objdump -xd linux | grep -A53 "<memchr>"

0000000060036cde <memchr>:
    60036cde:	f3 0f 1e fa          	endbr64 
    60036ce2:	41 89 f0             	mov    %esi,%r8d
    60036ce5:	40 f6 c7 07          	test   $0x7,%dil
    60036ce9:	74 1d                	je     60036d08 <memchr+0x2a>
    60036ceb:	48 85 d2             	test   %rdx,%rdx
    60036cee:	75 07                	jne    60036cf7 <memchr+0x19>
    60036cf0:	31 ff                	xor    %edi,%edi
    60036cf2:	e9 8d 00 00 00       	jmpq   60036d84 <memchr+0xa6>
    60036cf7:	48 ff ca             	dec    %rdx
    60036cfa:	44 38 07             	cmp    %r8b,(%rdi)
    60036cfd:	0f 84 81 00 00 00    	je     60036d84 <memchr+0xa6>
    60036d03:	48 ff c7             	inc    %rdi
    60036d06:	eb dd                	jmp    60036ce5 <memchr+0x7>
    60036d08:	48 83 fa 07          	cmp    $0x7,%rdx
    60036d0c:	76 60                	jbe    60036d6e <memchr+0x90>
    60036d0e:	49 b9 ff fe fe fe fe 	movabs $0xfefefefefefefeff,%r9
    60036d15:	fe fe fe 
    60036d18:	89 f1                	mov    %esi,%ecx
    60036d1a:	40 0f b6 f6          	movzbl %sil,%esi
    60036d1e:	31 c0                	xor    %eax,%eax
    60036d20:	49 ba 80 80 80 80 80 	movabs $0x8080808080808080,%r10
    60036d27:	80 80 80 
    60036d2a:	c1 e1 08             	shl    $0x8,%ecx
    60036d2d:	0f b7 c9             	movzwl %cx,%ecx
    60036d30:	09 f1                	or     %esi,%ecx
    60036d32:	48 63 c9             	movslq %ecx,%rcx
    60036d35:	48 89 ce             	mov    %rcx,%rsi
    60036d38:	48 c1 e6 10          	shl    $0x10,%rsi
    60036d3c:	48 09 ce             	or     %rcx,%rsi
    60036d3f:	48 89 f1             	mov    %rsi,%rcx
    60036d42:	48 c1 e1 20          	shl    $0x20,%rcx
    60036d46:	48 09 f1             	or     %rsi,%rcx
    60036d49:	31 f6                	xor    %esi,%esi
    60036d4b:	48 8b 07             	mov    (%rdi),%rax
    60036d4e:	48 31 c8             	xor    %rcx,%rax
    60036d51:	4a 8d 34 08          	lea    (%rax,%r9,1),%rsi
    60036d55:	48 f7 d0             	not    %rax
    60036d58:	4c 21 d0             	and    %r10,%rax
    60036d5b:	48 85 c6             	test   %rax,%rsi
    60036d5e:	75 0e                	jne    60036d6e <memchr+0x90>
    60036d60:	48 83 ea 08          	sub    $0x8,%rdx
    60036d64:	48 83 c7 08          	add    $0x8,%rdi
    60036d68:	48 83 fa 07          	cmp    $0x7,%rdx
    60036d6c:	73 dd                	jae    60036d4b <memchr+0x6d>
    60036d6e:	48 01 fa             	add    %rdi,%rdx
    60036d71:	48 39 fa             	cmp    %rdi,%rdx
    60036d74:	0f 84 76 ff ff ff    	je     60036cf0 <memchr+0x12>
    60036d7a:	44 38 07             	cmp    %r8b,(%rdi)
    60036d7d:	74 05                	je     60036d84 <memchr+0xa6>
    60036d7f:	48 ff c7             	inc    %rdi
    60036d82:	eb ed                	jmp    60036d71 <memchr+0x93>
    60036d84:	48 89 f8             	mov    %rdi,%rax
    60036d87:	c3                   	retq

注意到 60036d4b 到 60036d6c 是我寫的組合語言，至此我的程式成功進入 UML 中。

`unsigned long` 或 `long` 的 macro

執行 grep -r "sizeof(unsigned long)" * 或 grep -r "sizeof(long)" *

找到以下

arch/x86/crypto/crc32c-intel_glue.c:#define SCALE_F	sizeof(unsigned long)

Linux 核心之 unaligned / align macro

在尋找 macro 前，發現 kernel 中有些程式碼直接實作 unalign 而非使用 macro ，如 /lib/string.c 中的 strscpy

ssize_t strscpy(char *dest, const char *src, size_t count)
{
	...

#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	/*
	 * If src is unaligned, don't cross a page boundary,
	 * since we don't know if the next page is mapped.
	 */
	if ((long)src & (sizeof(long) - 1)) {
		size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));
		if (limit < max)
			max = limit;
	}
#else
	/* If src or dest is unaligned, don't do word-at-a-time. */
	if (((long) dest | (long) src) & (sizeof(long) - 1))
		max = 0;
#endif

	...
}

在 include/linux/align.h 有 alignment 的 macro 。

#define IS_ALIGNED(x, a)		(((x) & ((typeof(x))(a) - 1)) == 0)

對比原本的 unalign

#define UNALIGNED(X) ((long) X & (sizeof(long) - 1))

可發現實作方式基本一樣，只是多個 == 0 ，因此可以直接使用。

改成如下

    while (!IS_ALIGNED((long)src, sizeof(long))) {
        if (!length--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }

在 UML 測試 memchr

參考建構 User-Mode Linux 的實驗環境
編譯完 linux 執行檔，進行 UML 的設定時，執行 UML.sh，其內容如下

#!/bin/sh
./linux umid=uml0 \
        root=/dev/root rootfstype=hostfs hostfs=./rootfs \
        rw mem=64M init=/bin/sh quiet

出現以下錯誤

Aborted (core dumped)

於是決定直接執行 ./linux 並比較

原版

Core dump limits :
	soft - 0
	hard - NONE
Checking that ptrace can change system call numbers...OK
Checking syscall emulation patch for ptrace...OK
Checking advanced syscall emulation patch for ptrace...OK
Checking environment variables for a tempdir...none found
Checking if /dev/shm is on tmpfs...OK
Checking PROT_EXEC mmap in /dev/shm...OK
Adding 23236608 bytes to physical memory to account for exec-shield gap
Linux version 5.15.0 (hihihi@hihihi-VirtualBox) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #1 Sun May 8 15:57:10 CST 2022
Zone ranges:
  Normal   [mem 0x0000000000000000-0x0000000063628fff]
Movable zone start for each node
Early memory node ranges
  node   0: [mem 0x0000000000000000-0x0000000003628fff]
Initmem setup node 0 [mem 0x0000000000000000-0x0000000003628fff]
Built 1 zonelists, mobility grouping on.  Total pages: 13675
Kernel command line: root=98:0 console=tty
Dentry cache hash table entries: 8192 (order: 4, 65536 bytes, linear)
Inode-cache hash table entries: 4096 (order: 3, 32768 bytes, linear)
mem auto-init: stack:off, heap alloc:off, heap free:off
Memory: 26104K/55460K available (3379K kernel code, 925K rwdata, 1100K rodata, 144K init, 164K bss, 29356K reserved, 0K cma-reserved)

...

增加 string_64.c 版本

Core dump limits :
	soft - 0
	hard - NONE
Checking that ptrace can change system call numbers...OK
Checking syscall emulation patch for ptrace...OK
Checking advanced syscall emulation patch for ptrace...OK
Checking environment variables for a tempdir...none found
Checking if /dev/shm is on tmpfs...OK
Checking PROT_EXEC mmap in /dev/shm...OK
Adding 1441792 bytes to physical memory to account for exec-shield gap
Aborted (core dumped)

似乎是要印出 Linux version 5.15.0 ... 時出現 Aborted (core dumped) ，經過一番查找，發現是我自己寫的 memchr 中組合語言有問題，用原本測驗題的程式碼不會 Aborted (core dumped)，經過測試，發現是迴圈的條件判斷出問題，即這段指令 jae 1b\n\t ，修改成 ja 1b\n\t 即沒有問題。

在 Userspace 進行正確性驗證，將測試字串放入 memchr-opt-x86 及 glibc 採用 SSE2 指令的 memchr 並比較字串內容，測試到字串長度為 10 萬皆相同。

修改後的結果如下:
5/20 加入初始版註解

// SPDX-License-Identifier: GPL-2.0
#include <linux/string.h>
#include <linux/export.h>
#include <linux/align.h>

/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))

#ifdef __HAVE_ARCH_MEMCHR

void *memchr(const void *cs, int c, size_t length)
{
	const unsigned char *src = (const unsigned char *)cs;
	unsigned char d = c;
	while (!IS_ALIGNED((long)src, sizeof(long))) {
		if (!length--)
			return NULL;
		if (*src == d)
			return (void *)src;
		src++;
	}
	if (length >= LBLOCKSIZE) {
		unsigned long mask = d << 8 | d;
		unsigned int i = 32;
		long xor, data;
		const long consta = 0xFEFEFEFEFEFEFEFF,
			   constb = 0x8080808080808080;

		/*
		 * Create a 8-bytes mask for word-wise comparing.
		 * For example, a mask for 'a' is 0x6161616161616161.
		 */

		mask = mask << 16 | mask;
		for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
			mask = (mask << i) | mask;
		/*
		 * We perform word-wise comparing with following operation:
		 * 	1. Xor the long word @src and @mask into @xor.
		 * 	2. Add @xor with @consta.
		 * 	3. ~@xor & @constb.
		 * 	4. Perform & with the result of step 2 and 3.
		 *
		 * Step 1 creates a byte which is 0 in the long word if 
		 * there is at least one target byte in it.
		 *
		 * Step 2 to Step 4 find if there is a byte with 0 in
		 * the long word.  
		 */
		asm volatile("1:\n\t"
			     "movq (%0),%1\n\t"
			     "xorq %6,%1\n\t"
			     "lea (%1,%4), %2\n\t"
			     "notq %1\n\t"
			     "andq %5,%1\n\t"
			     "testq %1,%2\n\t"
			     "jne 2f\n\t"
			     "add $8,%0\n\t"
			     "sub $8,%3\n\t"
			     "cmp $7,%3\n\t"
			     "ja 1b\n\t"
			     "2:\n\t"
			     : "=D"(src), "=r"(xor), "=r"(data), "=r"(length)
			     : "r"(consta), "r"(constb), "r"(mask), "0"(src),
			       "1"(xor), "2"(data), "3"(length)
			     : "memory", "cc");
	}

	while (length--) {
		if (*src == d)
			return (void *)src;
		src++;
	}
	return NULL;
}

EXPORT_SYMBOL(memchr);
#endif

準備核心模組時，輸入下列命令

$ make _modinst_ MODLIB=`pwd`/rootfs/lib/modules/VER ARCH=um

出現以下

make: *** No rule to make target '_modinst_'.  Stop.

改成下列命令即可。

make modules_install MODLIB=`pwd`/rootfs/lib/modules/VER ARCH=um

在 Linux 核心程式碼最上層建立一個名為 tests 的目錄，對建構 User-Mode Linux 的實驗環境範例的 hello.c 稍作修改。
hello.c 內容:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>

MODULE_LICENSE("GPL");

static int hello_init(void)
{
    char str[1000];
    char ch = 'x';
    long time;
    char *ret;
    ktime_t kt;
    memset(str, '0', sizeof(char)*1000);
    str[998] = 'x';
    str[999] = '\0';
    kt = ktime_get();
    ret = memchr(str, ch, strlen(str));
    time = ktime_to_ns(ktime_sub(ktime_get(), kt));
    printk(KERN_ALERT "Hello World! - init\n");
    printk(KERN_ALERT "%s\n", ret);
    printk(KERN_ALERT "%ld\n", time);
    return 0;
}

static void hello_exit(void)
{
    printk(KERN_ALERT "Hello World! - exit\n");
}

module_init(hello_init);
module_exit(hello_exit);

Makefile 如下

obj-m += hello.o

PWD := $(shell pwd)
KDIR := $(PWD)/..

default:
    $(MAKE) -C $(KDIR) M=$(PWD) modules ARCH=um

clean:
    $(MAKE) -C $(KDIR) M=$(PWD) clean ARCH=um

編譯上述核心模組並複製到 rootfs 中

$ make -C tests
$ cp tests/hello.ko rootfs/

啟動 UML 並掛載

UML:/ # insmod hello.ko

在增加 string_64.c 的 UML 輸出結果如下

Hello World! - init
x
512

原始版本如下

Hello World! - init
x
768

第三行的結果為用 ktime 得到的時間，因為是 UML ，其值不代表實際時間，但可看出有我的 memchr 的確較快，也證實程式碼確實有進入核心。

考慮以下讓程式碼更精簡的修改:

@@ -12,8 +12,7 @@
 
 void *memchr(const void *cs, int c, size_t length)
 {
-	const unsigned char *src = (const unsigned char *)cs;
-	unsigned char d = c;
+	const unsigned char *src = (const unsigned char *)cs, d = c;
 	while (!IS_ALIGNED((long)src, sizeof(long))) {
 		if (!length--)
 			return NULL;
@@ -33,9 +32,9 @@ void *memchr(const void *cs, int c, size
 		 * For example, a mask for 'a' is 0x6161616161616161.
 		 */
 
-		mask = mask << 16 | mask;
+		mask |= mask << 16;
 		for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
-			mask = (mask << i) | mask;
+			mask |= mask << i;
 		/*
 		 * We perform word-wise comparing with following operation:
 		 * 	1. Xor the long word @src and @mask into @xor.

記得要用 checkpath 檢查，搭配 –strict 選項
jserv

將 `memchr()` 放入核心並重新 compile

這是在虛擬機重新編譯核心，劃分的硬碟空間建議只少 50 GB，CPU Core至少分配 4 個。

安裝必要套件

$ sudo apt-get install liblz4-tool
$ sudo apt-get install libc6-dev
$ sudo apt-get install bison
$ sudo apt-get install flex
$ sudo apt-get install libelf-dev
$ sudo apt-get install libncurses5-dev openssl libssl-dev 
$ sudo apt-get install xz-utils wget ca-certificates bc
$ sudo apt install build-essential libncurses-dev
$ sudo apt-get install libzstd-dev
$ sudo apt-get install zstd

下載 Linux Kernel 程式碼

$ wget https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.17.9.tar.xz

解壓縮並進入到此目錄

$ cd linux-5.17.9

複製原本的 .config

$ cp /boot/5.13.0-41-generic .config

產生所需 .config

$ make menuconfig

選擇 load 再選擇 save

接著執行下列命令或動作避免編譯過程產生錯誤

$ scripts/config --disable SYSTEM_TRUSTED_KEYS
$ scripts/config --disable SYSTEM_REVOCATION_KEYS

並將 .config 中 CONFIG_DEBUG_INFO_BTF 改成如下:

CONFIG_DEBUG_INFO_BTF=n

開始編譯， -j 4 表示動用四個 CPU Core 加速編譯，此過程耗費時間最長。

$ sudo make -j 4

跑完會有以下輸出

Kernel: arch/x86/boot/bzImage is ready  (#2)

編譯 modules

$ sudo make modules -j 4

安裝 module

$ sudo make INSTALL_MOD_STRIP=1 modules_install -j 4

安裝核心

$ sudo make install
$ sudo mkinitramfs /lib/modules/5.17.9/ -o /boot/initrd.img-5.17.9

更新 grub

$ sudo update-grub2

接著重新開機，從 UML 的安裝可知 memchr() 和開機有關，若核心有成功更新成 5.17.9 且能成功開機，則更新後的 memchr() 應該是沒有大問題。

輸入命令檢查核心版本

$ uname -r

得到以下輸出

5.17.9

核心版本成功更新。

接著一樣撰寫 kernel module 觀察 memchr-opt-x86 是否真的編譯入核心。

#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>

MODULE_LICENSE("GPL");
#define LBLOCKSIZE (sizeof(long))
void *memchr1(const void *cs, int c, size_t length)
{
	const unsigned char *src = (const unsigned char *)cs;
	unsigned char d = c;
	while (!IS_ALIGNED((long)src, sizeof(long))) {
		if (!length--)
			return NULL;
		if (*src == d)
			return (void *)src;
		src++;
	}
	if (length >= LBLOCKSIZE) {
		unsigned long mask = d << 8 | d;
		unsigned int i = 32;
		long xor, data;
		const long consta = 0xFEFEFEFEFEFEFEFF,
			   constb = 0x8080808080808080;

		/*
		 * Create a 8-bytes mask for word-wise comparing.
		 * For example, a mask for 'a' is 0x6161616161616161.
		 */

		mask = mask << 16 | mask;
		for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
			mask = (mask << i) | mask;
		/*
		 * We perform word-wise comparing with following operation:
		 * 	1. Xor the long word @src and @mask into @xor.
		 * 	2. Add @xor with @consta.
		 * 	3. ~@xor & @constb.
		 * 	4. Perform & with the result of step 2 and 3.
		 *
		 * Step 1 creates a byte which is 0 in the long word if 
		 * there is at least one target byte in it.
		 *
		 * Step 2 to Step 4 find if there is a byte with 0 in
		 * the long word.  
		 */
		asm volatile("1:\n\t"
			     "movq (%0),%1\n\t"
			     "xorq %6,%1\n\t"
			     "lea (%1,%4), %2\n\t"
			     "notq %1\n\t"
			     "andq %5,%1\n\t"
			     "testq %1,%2\n\t"
			     "jne 2f\n\t"
			     "add $8,%0\n\t"
			     "sub $8,%3\n\t"
			     "cmp $7,%3\n\t"
			     "ja 1b\n\t"
			     "2:\n\t"
			     : "=D"(src), "=r"(xor), "=r"(data), "=r"(length)
			     : "r"(consta), "r"(constb), "r"(mask), "0"(src),
			       "1"(xor), "2"(data), "3"(length)
			     : "memory", "cc");
	}

	while (length--) {
		if (*src == d)
			return (void *)src;
		src++;
	}
	return NULL;
}

static int hello_init(void)
{
    char str[1000];
    char ch = 'x';
    long time,time1;
    char *ret, *ret2;
    ktime_t kt,kt2;
    memset(str, '0', sizeof(char)*1000);
    str[998] = 'x';
    str[999] = '\0';
    kt = ktime_get();
    ret = memchr1(str, ch, strlen(str));
    time = ktime_to_ns(ktime_sub(ktime_get(), kt));
    printk(KERN_ALERT "Hello World! - init\n");
    printk(KERN_ALERT "%s\n", ret);
    printk(KERN_ALERT "%ld\n", time);
    kt2 = ktime_get();
    ret2 = memchr1(str, ch, strlen(str));
    time1 = ktime_to_ns(ktime_sub(ktime_get(), kt2));
    printk(KERN_ALERT "%s\n", ret2);
    printk(KERN_ALERT "%ld\n", time1);
    return 0;
}

static void hello_exit(void)
{
    printk(KERN_ALERT "Hello World! - exit\n");
}

module_init(hello_init);
module_exit(hello_exit);

Makefile

obj-m += hello.o

PWD := $(shell pwd)
KDIR := /lib/modules/$(shell uname -r)/build

default:
	make -C $(KDIR) M=$(PWD) modules 

clean:
	make -C $(KDIR) M=$(PWD) clean

執行輸入下列命令將核心模組放入核心

$ sudo insmod hello.ko
$ sudo rmmod hello

得到以下結果

[ 1003.758432] Hello World! - init
[ 1003.758435] x
[ 1003.758436] 458
[ 1003.758437] x
[ 1003.758437] 462
[ 1005.920261] Hello World! - exit

可見核心的 memchr 和 memchr-opt-x86 執行時間相近，可以說成功放入 Linux Kernel。

今天發布 5.18 版，可能要再編譯一次核心並測試

經過重複上述步驟，在 5.18 Linux kernel 成功開機，利用上述的 kernel module 的結果也相似。

提交 patch

patch 內容

第一個 commit message 內容

x86/lib: Optimize memchr()

The original assembly version of memchr() is implemented with
the byte-wise comparing technique, which does not fully use 64-bits 
registers in x86-64 CPU. We use word-wide comparing so that 8 
characters can be compared at the same time on x86-64 CPU. First, 
we align the input and then use word-wise comparing to find the 
first 64-bit word that contain the target. Secondly, we compare 
every byte in the word and get the output.

We create two files to measure the performance. The first file
contains on average 10 characters ahead the target character.
The second file contains at least 1000 characters ahead the
target character. Our implementation of “memchr()” is slightly
better in the first test and nearly 4x faster than the orginal
implementation in the second test.

Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com>
Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>

第二個 commit message 內容

x86/um: Use x86_64-optimized memchr

Add x86_64-optimized memchr, which is 4x faster
than the original implementation, into um.

Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com>
Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>

第一個 commit 跑命令 ./scripts/checkpatch.pl -strict 出現以下訊息

CHECK: extern prototypes should be avoided in .h files
#42: FILE: arch/x86/include/asm/string_64.h:18:
+extern void *memchr(const void *cs, int c, size_t length);

WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#59: 
new file mode 100644

WARNING 的部分確定是要新增檔案，因此可以不理會。 CHECK 的部分建議在 .h 中避免 extern prototypes ，我翻查 Linux Kernel 中其他 string.h 相關的 memchr() prototype ，皆是 extern prototype 。

按照老師的指示增加了 cover letter

Subject: [PATCH 0/2] x86: Optimize memchr() for x86-64 
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

*** BLURB HERE ***
These patch series add an optimized "memchr()" for x86-64 and 
USER-MODE LINUX (UML).
 
There exists an assemebly implementation for x86-32. However, 
for x86-64, there isn't any optimized version. We implement word-wise 
comparison so that 8 characters can be compared at the same time on 
x86-64 CPU. The optimized “memchr()” is nearly 4x faster than the 
orginal implementation for long strings.

We test the optimized “memchr()” in UML and also recompile the 5.18 
Kernel with the optimized “memchr()”. They run correctly.

In this patch we add a new file "string_64.c", which only contains 
"memchr()". We can add more optimized string functions in it in the 
future.

Yu-Jen Chang (2):
  x86/lib: Optimize memchr()
  x86/um: Use x86_64-optimized memchr

 arch/x86/include/asm/string_64.h |  3 ++
 arch/x86/lib/Makefile            |  1 +
 arch/x86/lib/string_64.c         | 78 ++++++++++++++++++++++++++++++++
 arch/x86/um/Makefile             |  2 +-
 4 files changed, 83 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/lib/string_64.c

-- 
2.25.1

patch 收件者

第一份

$ ./scripts/get_maintainer.pl -f arch/x86/include/asm/string_64.h
Thomas Gleixner <tglx@linutronix.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Ingo Molnar <mingo@redhat.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Borislav Petkov <bp@alien8.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Dave Hansen <dave.hansen@linux.intel.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
x86@kernel.org (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
"H. Peter Anvin" <hpa@zytor.com> (reviewer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Kees Cook <keescook@chromium.org> (supporter:FORTIFY_SOURCE)
linux-kernel@vger.kernel.org (open list:X86 ARCHITECTURE (32-BIT AND 64-BIT))
linux-hardening@vger.kernel.org (open list:FORTIFY_SOURCE)

第二份

$ ./scripts/get_maintainer.pl -f arch/x86/um
Richard Weinberger <richard@nod.at> (maintainer:USER-MODE LINUX (UML))
Anton Ivanov <anton.ivanov@cambridgegreys.com> (maintainer:USER-MODE LINUX (UML))
Johannes Berg <johannes@sipsolutions.net> (maintainer:USER-MODE LINUX (UML))
Thomas Gleixner <tglx@linutronix.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Ingo Molnar <mingo@redhat.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Borislav Petkov <bp@alien8.de> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
Dave Hansen <dave.hansen@linux.intel.com> (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
x86@kernel.org (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
"H. Peter Anvin" <hpa@zytor.com> (reviewer:X86 ARCHITECTURE (32-BIT AND 64-BIT))
linux-um@lists.infradead.org (open list:USER-MODE LINUX (UML))
linux-kernel@vger.kernel.org (open list:X86 ARCHITECTURE (32-BIT AND 64-BIT))

第二份的收件人應該是 Jeff Dike (5/28 更正)， email 為 jdike@linux.intel.com 。第一份的收件人應為 Andi Kleen 。雖然他不在前述的名單上，之前用 git blame 查詢到他是相關檔案的維護者，他的 email 為 ak@linux.intel.com 。

email 傳送測試

安裝 git-email

$ sudo apt-get install git-email

設定 .gitconfig

[user]
	name = arthurchang09
	email = arthurchang09@gmail.com
[sendemail]
	smtpEncryption = tls
	smtpServer = smtp.gmail.com
	smtpServerPort = 587
	smtpUser = arthurchang09@gmail.com

由於我的 gmail 有設定雙重驗證，所以要透過 git 送 email 要開啟應用程式密碼，在手機開啟 Google帳戶 -> 安全性 -> 應用程式密碼，在此設定密碼。

接著輸入命令

git send-email --to arthurchang09@gmail.com \
--cc arthur09123456@gmail.com \
0001-x86-lib-Optimize-memchr.patch\ 
0002-um-Use-x86_64-optimized-memchr.patch

要輸入密碼時，輸入前面設定的密碼，抑或是在 .gitconfig 設定。
傳輸完成後在我的兩個信箱皆有收到信。

開發者的回覆

開發者有三點回復:

1. d = c 好像是 pointer 和值的比較，可能有問題
1. consta (0xFEFEFEFEFEFEFEFF) constb (0x8080808080808080) 的用途是什麼?
1. 組合語言有加法(consta)，是否需要考慮 overflow 的問題，即確認 CF flag 。另外 jne 指令是否要改為 jnz ，後者似乎更合適。

經過一番檢視，我的答案如下，目前尚未回覆給開發者：

1. 開發者可能看錯 d 的宣告方式，這應是 unsigned char 而非 unsigned char* ，一開始的 unsigned char* src 讓開發者以為 d 是 pointer ，實際上不是。(可能要指出 C 語言規格書的說明?)
1. 可能需要詳細解說演算法，才能解釋其功用
1. 這其實是在做減法，即減去 0x0101010101010101 ，只是為了減少步驟寫成加法。因此 CF flag 不需要處理。指令寫成 jnz 會更有助於理解，但其作用和 jne 相同，皆是檢查 Zero Flag ，會做修改。

Andi Kleen 則是要求指出在 Kernel 中具體的效益，目前在研究中

另一位開發者 David Laight 也有提出幾點意見:

1. alignment 似乎是不必要
1. for 迴圈是多餘的，只需保留其中的 mask |= mask << i
1. src 需要綁定 %rdi ?
1. 同時進行 read-write 的 register 要用 +r ，不要重複寫
1. You should also be able to code without needing add, sub and cmp at the end of the loop.
1. If you use negative offsets from the end of the buffer the loop can be a single add and jnz.

我經過測試，發現應該是要將 length 變成 8 的倍數，因此將 IS_ALIGN() 巨集改成 length & LBLOCKSIZE 。 if 條件判斷中將 for 迴圈去掉，將 inline assembly 做一些調整並改寫了一小部分。經過 UML 測試能成功啟動。但對於 David Laight 最後兩點還是不知如何達成。

我去掉了 alignment 的部分，並且新增一個變數 dst ，為執行 word-wise comparison 的上界。因此在組合語言中，不必對 length 做減法，只需比較 src 和 dst 之大小關係，省下一個指令，等於每跑一次迴圈，減少執行一個指令。

// SPDX-License-Identifier: GPL-2.0
#include <linux/string.h>
#include <linux/export.h>
#include <linux/align.h>

/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))

#ifdef __HAVE_ARCH_MEMCHR

void *memchr(const void *cs, int c, size_t length)
{
	const unsigned char *src = (const unsigned char *)cs;
	const unsigned char *end = src + length;

	if (length >= LBLOCKSIZE) {
		unsigned long mask = c << 8 | c;
		long xor, data;
		const long consta = 0xFEFEFEFEFEFEFEFF,
			   constb = 0x8080808080808080;

		const unsigned char *dst = (const unsigned char *)src +
					   (length & 0xFFFFFFFFFFFFFFF8);

		/*
		 * Create a 8-bytes mask for word-wise comparing.
		 * For example, a mask for 'a' is 0x6161616161616161.
		 */

		mask |= mask << 16;
		mask |= mask << 32;
		/*
		 * We perform word-wise comparing with following operation:
		 *	1. Perform xor on the long word @src and @mask
		 *	   and put into @xor.
		 *	2. Add @xor with @consta.
		 *	3. ~@xor & @constb.
		 *	4. Perform & with the result of step 2 and 3.
		 *
		 * Step 1 creates a byte which is 0 in the long word if
		 * there is at least one target byte in it.
		 *
		 * Step 2 to Step 4 find if there is a byte with 0 in
		 * the long word.
		 */
		asm volatile("1:\n\t"
			     "movq (%[src]),%[xor]\n\t"
			     "xorq %[mask],%[xor]\n\t"
			     "lea (%[xor],%[const_a]), %[tmp]\n\t"
			     "notq %[xor]\n\t"
			     "andq %[const_b],%[xor]\n\t"
			     "testq %[xor],%[tmp]\n\t"
			     "jnz 2f\n\t"
			     "add $8,%[src]\n\t"
			     "cmp %[src], %[dst]\n\t"
			     "ja 1b\n\t"
			     "2:\n\t"
			     :
			     [src] "+r"(src), [xor] "+r"(xor), [tmp] "+r"(data)
			     : [const_a] "r"(consta), [const_b] "r"(constb),
			       [mask] "r"(mask), [dst] "r"(dst)
			     : "memory", "cc");
	}

	while (src <= end) {
		if (*src == c)
			return (void *)src;
		src++;
	}
	return NULL;
}
EXPORT_SYMBOL(memchr);
#endif

David Laight 將我的程式碼改寫，整題而言較為簡潔易懂，沒有組合語言。也解決 target character 為負時會出錯的問題。我認文可以當最終版本

void *memchr(const void *p, int c, unsigned long length)
{
    unsigned long mask, val;
    const void *end = p + length;

    c &= 0xff;
    if (p <= end - 8) {
        mask = c | c << 8;
        mask |= mask << 16;
        mask |= mask << 32;

        for (; p <= end - 8; p += 8) {
            val = *(unsigned long *)p ^ mask;
            if ((val + 0xfefefefefefefeffu) & (~val & 0x8080808080808080u))
                break;
        }
    }

    for (; p < end; p++)
        if (*(unsigned char *)p == c)
            return p;

    return NULL;
}

Linux Kernel UML 開機使用的 memchr 長度

在程式碼中加入

printk("le %ld\n", length);

發現執行 UML 時會卡住，因此改寫成

if (length >= 100){
    printk("le %d\n", length);
}

接著先執行先前編譯好的 UML 執行檔

$ ./linux

擷取部分輸出

...
Linux version 5.18.0 (hihihi@hihihi-VirtualBox) (gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #37 Mon Jun 6 17:46:51 CST 2022
l 162,10
...
Memory: 26132K/50688K available (3315K kernel code, 1002K rwdata, 1112K rodata, 143K init, 166K bss, 24556K reserved, 0K cma-reserved)
l 134,10
...
Console initialized on /dev/tty0
l 162,10
l 134,10
...

可看到 call memchr() 時，字串的長度並不長，大約只有不到 200 個字元。根據先前的測試，優化過的 memchr() 的效能優化並不明顯。

[    0.000000] le 177
[    0.000000] Command line: BOOT_IMAGE=/boot/vmlinuz-5.18.0 root=UUID=43f42c7b-5930-4dc9-b2f4-48f8220dd500 ro quiet splash
[    0.000000] le 108
...

[    3.620183] audit: type=1400 audit(1654591700.584:10): apparmor="STATUS" operation="profile_load" profile="unconfined" name="/usr/lib/connman/scripts/dhclient-script" pid=579 comm="apparmor_parser"
[    3.830941] le 177
[    3.830943] le 108
[    3.830953] le 104
[    3.830971] le 103
[    3.830975] le 115
[    3.830976] le 110
[    3.830978] le 142
[    3.831234] le 104
[    3.831237] le 132
[    3.831272] le 115
[    3.831663] le 123
[    3.831823] le 114
[    3.831824] le 120
[    3.831825] le 241
[    3.831827] le 227
[    3.831847] le 150
[    3.831850] le 101
[    3.831851] le 156
[    3.831945] le 109
[    3.831947] le 166
[    3.831948] le 161
[    3.831949] le 155
[    3.831950] le 174
[    3.831951] le 159
[    3.831951] le 172
[    3.831952] le 189
[    3.831953] le 182
[    3.831954] le 185
[    4.129413] e1000: enp0s3 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX
...

[    9.045064] 08:48:26.011645 main     6.1.14 r140239 started. Verbose level = 0
[    9.045492] le 127
[    9.045495] le 126
[    9.046254] 08:48:26.012759 main     vbglR3GuestCtrlDetectPeekGetCancelSupport: Supported (#1)
[    9.186262] systemd-journald[256]: File /var/log/journal/6196ff73bdf74ff4ae61063ebf32f0c3/user-1000.journal corrupted or uncleanly shut down, renaming and replacing.
[    9.189583] le 153
[   10.149762] Service: Shared Clipboard


[   15.078903] le 177
[   15.078907] le 108
[   15.078915] le 104
[   15.078928] le 103
[   15.078931] le 115
[   15.078932] le 110
[   15.078933] le 142
[   15.078933] le 177
[   15.078935] le 108
[   15.078939] le 104
[   15.078943] le 104
[   15.078949] le 132
[   15.078962] le 103
[   15.078965] le 115
[   15.078966] le 110
[   15.078968] le 142
[   15.078973] le 104
[   15.078976] le 132
[   15.078976] le 115
[   15.079002] le 115
[   15.079014] le 123
[   15.079040] le 123
[   15.079041] le 114
[   15.079042] le 120
[   15.079043] le 241
[   15.079044] le 227
[   15.079060] le 150
[   15.079062] le 101
[   15.079063] le 156
[   15.079067] le 114
[   15.079067] le 120
[   15.079069] le 241
[   15.079070] le 109
[   15.079070] le 227
[   15.079072] le 166
[   15.079073] le 161
[   15.079074] le 155
[   15.079074] le 174
[   15.079075] le 159
[   15.079076] le 172
[   15.079076] le 189
[   15.079077] le 182
[   15.079078] le 185
[   15.079085] le 127
[   15.079086] le 150
[   15.079087] le 126
[   15.079088] le 101
[   15.079089] le 153
[   15.079089] le 156
[   15.079096] le 109
[   15.079098] le 166
[   15.079099] le 161
[   15.079100] le 155
[   15.079100] le 174
[   15.079101] le 159
[   15.079102] le 172
[   15.079102] le 189
[   15.079103] le 182
[   15.079104] le 185
[   15.079111] le 127
[   15.079113] le 126
[   15.079114] le 177
[   15.079115] le 153
[   15.079115] le 108
[   15.079122] le 104
[   15.079136] le 103
[   15.079139] le 115
[   15.079139] le 110
[   15.079141] le 142
[   15.079141] le 177
[   15.079142] le 108
[   15.079147] le 104
[   15.079149] le 132
[   15.079150] le 104
[   15.079163] le 103
[   15.079166] le 115
[   15.079167] le 110
[   15.079168] le 142
[   15.079174] le 104
[   15.079176] le 115
[   15.079176] le 132
[   15.079202] le 115
[   15.079213] le 123
[   15.079239] le 114
[   15.079240] le 123
[   15.079240] le 120
[   15.079242] le 241
[   15.079243] le 227
[   15.079258] le 150
[   15.079260] le 101
[   15.079261] le 156
[   15.079266] le 114
[   15.079267] le 120
[   15.079268] le 109
[   15.079268] le 241
[   15.079270] le 227
[   15.079270] le 166
[   15.079271] le 161
[   15.079272] le 155
[   15.079273] le 174
[   15.079273] le 159
[   15.079274] le 172
[   15.079275] le 189
[   15.079275] le 182
[   15.079276] le 185
[   15.079284] le 127
[   15.079285] le 126
[   15.079286] le 150
[   15.079288] le 101
[   15.079289] le 156
[   15.079289] le 153
[   15.079296] le 109
[   15.079298] le 166
[   15.079299] le 161
[   15.079299] le 155
[   15.079300] le 174
[   15.079301] le 159
[   15.079301] le 172
[   15.079302] le 189
[   15.079303] le 182
[   15.079303] le 185
[   15.079311] le 127
[   15.079313] le 126
[   15.079316] le 153
[   75.957408] le 177
[   75.957411] le 108
[   75.957419] le 104
[   75.957433] le 103
[   75.957435] le 115
[   75.957436] le 110
[   75.957437] le 142
[   75.957443] le 104
[   75.957445] le 132
[   75.957472] le 115
[   75.957509] le 123
[   75.957535] le 114
[   75.957536] le 120
[   75.957537] le 241
[   75.957539] le 227
[   75.957554] le 150
[   75.957556] le 101
[   75.957557] le 156
[   75.957563] le 109
[   75.957565] le 166
[   75.957566] le 161
[   75.957567] le 155
[   75.957567] le 174
[   75.957568] le 159
[   75.957568] le 172
[   75.957569] le 189
[   75.957570] le 182
[   75.957570] le 185
[   75.957578] le 127
[   75.957580] le 126
[   75.957581] le 153
[   75.957627] le 177
[   75.957628] le 108
[   75.957635] le 104
[   75.957648] le 103
[   75.957651] le 115
[   75.957652] le 110
[   75.957653] le 142
[   75.957659] le 104
[   75.957661] le 132
[   75.957687] le 115
[   75.957724] le 123
[   75.957750] le 114
[   75.957751] le 120
[   75.957752] le 241
[   75.957753] le 227
[   75.957768] le 150
[   75.957771] le 101
[   75.957771] le 156
[   75.957778] le 109
[   75.957780] le 166
[   75.957781] le 161
[   75.957781] le 155
[   75.957782] le 174
[   75.957782] le 159
[   75.957783] le 172
[   75.957784] le 189
[   75.957784] le 182
[   75.957785] le 185
[   75.957793] le 127
[   75.957794] le 126
[   75.957796] le 153
[   75.993516] le 177
[   75.993520] le 108
[   75.993530] le 104
[   75.993549] le 103
[   75.993554] le 115
[   75.993554] le 110
[   75.993556] le 142
[   75.993564] le 104
[   75.993568] le 132
[   75.993609] le 115
[   75.993668] le 123
[   75.993711] le 114
[   75.993712] le 120
[   75.993714] le 241
[   75.993717] le 227
[   75.993742] le 150
[   75.993745] le 101
[   75.993747] le 156
[   75.993757] le 109
[   75.993759] le 166
[   75.993760] le 161
[   75.993761] le 155
[   75.993762] le 174
[   75.993763] le 159
[   75.993764] le 172
[   75.993765] le 189
[   75.993766] le 182
[   75.993766] le 185
[   75.993779] le 127
[   75.993782] le 126
[   75.993784] le 153
[   75.993885] le 177
[   75.993887] le 108
[   75.993898] le 104
[   75.993920] le 103
[   75.993925] le 115
[   75.993926] le 110
[   75.993928] le 142
[   75.993937] le 104
[   75.993941] le 132
[   75.993987] le 115
[   75.994049] le 123
[   75.994091] le 114
[   75.994092] le 120
[   75.994094] le 241
[   75.994097] le 227
[   75.994121] le 150
[   75.994124] le 101
[   75.994126] le 156
[   75.994136] le 109
[   75.994139] le 166
[   75.994140] le 161
[   75.994141] le 155
[   75.994142] le 174
[   75.994143] le 159
[   75.994144] le 172
[   75.994145] le 189
[   75.994146] le 182
[   75.994147] le 185
[   75.994159] le 127
[   75.994161] le 126
[   75.994164] le 153

以上是將化的 memchr() 重新編u4並放入 kernel ，開機後的結果。同樣最多只有 200 多的字元。

目前 arch/x86 目錄中，針對 x86-64 調整過的實作是 memcpy, memset, memmove，但其他的 mem/str 系列函式仍採用 generic 實作。也許可查閱 arch/x86/lib/string_32.c 以確認在 Linux 核心使用的狀況。

我接著在 /lib/string.c 的 memchr 增加程式碼

void *memchr(const void *s, int c, size_t n)
{
+	if (n >= 100) {
+		printk("lib %ld\n", n);
+	}
	const unsigned char *p = s;
	while (n-- != 0) {
        	if ((unsigned char)c == *p++) {
			return (void *)(p - 1);
		}
	}
	return NULL;
}

得到的結果類似先前的測試，顯示 generic 實作似乎沒有被使用。目前看起來 memchr() 的使用狀況看我之前的觀察類似，透過 #define __HAVE_ARCH_MEMCHR 決定使用哪種 memchr() 。由於前述尋找 memchr() 皆是在虛擬機上運行。接下來會嘗試在雙系統重新編譯 kernel 並且測試使否有所改變。

嘗試繼續尋找，在 /drivers/misc/lkdtm/heap.c 第 207 行找到






	if (memchr(val, 0xAB, 512) == NULL) {
		pr_info("Memory appears initialized (%x, no earlier values)\n", *val);
	} else {
		pr_err("FAIL: Slab was not initialized\n");
		pr_expected_config_param(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, "init_on_alloc");
	}

這裡可以看到要搜尋 512 byte

更改貢獻方式，整合 `/lib/string.c` 的 `memchr()` 與 `memchr_inv`

跟老師討論後決定改成貢獻 lib/string.c 中的 memchr() ，並且和其功能相似但作用相反的涵式 memchr_inv() 進行 macro 上的整合。

分析 `memchr_inv()`

功能為找出第一個和不標字元不相同之字元位址，也就是 memchr() 相反的功能。

在 /lib/string.c 的實作如下

static void *check_bytes8(const u8 *start, u8 value, unsigned int bytes)
{
	while (bytes) {
		if (*start != value)
			return (void *)start;
		start++;
		bytes--;
	}
	return NULL;
}

void *memchr_inv(const void *start, int c, size_t bytes)
{
	u8 value = c;
	u64 value64;
	unsigned int words, prefix;

	if (bytes <= 16)
		return check_bytes8(start, value, bytes);

	value64 = value;
#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
	value64 *= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
	value64 *= 0x01010101;
	value64 |= value64 << 32;
#else
	value64 |= value64 << 8;
	value64 |= value64 << 16;
	value64 |= value64 << 32;
#endif

	prefix = (unsigned long)start % 8;
	if (prefix) {
		u8 *r;

		prefix = 8 - prefix;
		r = check_bytes8(start, value, prefix);
		if (r)
			return r;
		start += prefix;
		bytes -= prefix;
	}

	words = bytes / 8;

	while (words) {
		if (*(u64 *)start != value64)
			return check_bytes8(start, value, 8);
		start += 8;
		words--;
	}

	return check_bytes8(start, value, bytes % 8);
}

這裡可看到在字串長度小於 16 字元時，直接呼叫 check_bytes8() 尋找第一個不符合的字元。

接著使用一連串的 #if defined 來建立 mask 。如果是 64-bits CPU ，其 BITS_PER_LONG 會是 64 ，以及有 ARCH_HAS_FAST_MULTIPLIER ，直接用乘法建立，像我的 CPU 屬於此類。若是 32-bits 且有 ARCH_HAS_FAST_MULTIPLIER ，會先建立 32-bits mask 再用 | 建成 64-bits mask。若沒有 ARCH_HAS_FAST_MULTIPLIER 則都採用 | 建立。

值得注意的是無論是 32-bits 或是 64-bits ， memchr_inv() 一律採用 64-bits 的 mask 。然而在 32-bits CPU ， u64 會需要使用兩個 register ，這樣執行效能是否和採用 unsigned int 效能一樣需要近一步的測試。

使用 complier explorer 並下 -m32 查看 32 bit 和 64 bits 的差別。在 32 bits 下都會用到較多的指令，如：

    value64 = value;

32 bits

        movzx   eax, BYTE PTR [ebp-13]
        mov     DWORD PTR [ebp-24], eax
        mov     DWORD PTR [ebp-20], 0

64 bits

        movzx   eax, BYTE PTR [rbp-5]
        mov     QWORD PTR [rbp-16], rax

接下來 prefix 變數是用來處理 alignment ，將 unalign 的字元先處理掉。

        while (words) {
            if (*(u64 *)start != value64)
                return check_bytes8(start, value, 8);
                start += 8;
                words--;
        }

接著 while 迴圈對長度為 8 bytes 的字串進行比較，若有不同則呼叫 check_bytes8() 尋找第一個不同的字元的位址。

最後再處理長度不到 8 bytes 的字串。

找出 `memchr` 和 `memchr_inv` 共同處並合併

兩者最明顯的共同之處為皆是要產生一個 word 長度的 mask ，因此可以寫成一個巨集，如下：

#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
    #define CREATE_MASK(val) \
        val*= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
    #define CREATE_MASK(val) \
        val *= 0x01010101; \
        val |= val << 32;
#else
    #define CREATE_MASK(val) \
        val |= val << 8; \
        val |= val << 16; \
        val |= val << 32;
#endif

巨集名稱 CREATE_MASK 可能還要修改，目前先暫訂如此進行簡單測試。

由於 memchr_inv() 無論是 32-bit 或是 64-bit 皆是採用 u64 的型態進行處理，因此在 memchr() 也打算用同樣的方式，才能使用共同的巨集。

memchr() 變成如下

#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
    #define CREATE_MASK(val) \
        val*= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
    #define CREATE_MASK(val) \
        val *= 0x01010101; \
        val |= val << 32;
#else
    #define CREATE_MASK(val) \
        val |= val << 8; \
        val |= val << 16; \
        val |= val << 32;
#endif

void *memchr(const void *p, int c, unsigned long length)
{
    unsigned long long mask, val;
    const void *end = p + length;

    c &= 0xff;
    if (p <= end - 8) {
        mask = c;
        CREATE_MASK(mask);

        for (; p <= end - 8; p += 8) {
            val = *(unsigned long *)p ^ mask;
            if ((val + 0xfefefefefefefeffu) & (~val & 0x8080808080808080u))
                break;
        }
    }

    for (; p < end; p++)
        if (*(unsigned char *)p == c)
            return (void *)p;

    return NULL;
}

使用以下命令編譯成 x86-32 進行測試

gcc -m32 -O2 -o a.out test.c

成功通過之前使用的正確性測試

接著在 UML 測試修改過的 /lib/string.c 能否通過 UML 開機測試。

#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64

#define CREATE_CHAR_MASK(val) val *= 0x0101010101010101ULL;

#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)

#define CREATE_CHAR_MASK(val)                                                  \
	val *= 0x01010101;                                                     \
	val |= val << 32;

#else

#define CREATE_CHAR_MASK(val)                                                  \
	val |= val << 8;                                                       \
	val |= val << 16;                                                      \
	val |= val << 32;

#endif

#ifndef __HAVE_ARCH_MEMCHR
/**
 * memchr - Find a character in an area of memory.
 * @s: The memory area
 * @c: The byte to search for
 * @n: The size of the area.
 *
 * returns the address of the first occurrence of @c, or %NULL
 * if @c is not found
 */

void *memchr(const void *p, int c, unsigned long length)
{
	u64 mask, val;
	const void *end = p + length;

	c &= 0xff;
	if (p <= end - 8) {
		mask = c;
		CREATE_CHAR_MASK(mask);

		for (; p <= end - 8; p += 8) {
			val = *(unsigned long *)p ^ mask;
			if ((val + 0xfefefefefefefeffu) &
			    (~val & 0x8080808080808080u))
				break;
		}
	}

	for (; p < end; p++)
		if (*(unsigned char *)p == c)
			return (void *)p;

	return NULL;
}
EXPORT_SYMBOL(memchr);
#endif

void *memchr_inv(const void *start, int c, size_t bytes)
{
	u8 value = c;
	u64 value64;
	unsigned int words, prefix;

	if (bytes <= 16)
		return check_bytes8(start, value, bytes);

	value64 = value;
	CREATE_CHAR_MASK(value64);

	prefix = (unsigned long)start % 8;
	if (prefix) {
		u8 *r;

		prefix = 8 - prefix;
		r = check_bytes8(start, value, prefix);
		if (r)
			return r;
		start += prefix;
		bytes -= prefix;
	}

	words = bytes / 8;

	while (words) {
		if (*(u64 *)start != value64)
			return check_bytes8(start, value, 8);
		start += 8;
		words--;
	}

	return check_bytes8(start, value, bytes % 8);
}
EXPORT_SYMBOL(memchr_inv);

重新編譯 kernel

依照之前重新編譯 kernel 的方式將上方修改過的 memchr() 以及 memchr_inv() ，成功重新開機。

然而，重新編譯只編譯 64-bit 的 kernel ， 32-bit kernel 並沒有測試到。由於 Ubuntu 已經沒有 32 bit 的映像檔，因此我選用 Linux Mint 32-bit 來對kernel 進行重新編譯。

首先去官網下載 32-bit 版本。使用 Virtual Box 安裝時選擇 Ubuntu 32-bit 即可。
從 lscpu 可看到

CPU op-mode(s):                  32-bit

對比 64-bit Ubuntu

CPU op-mode(s):                  32-bit, 64-bit

這應該是 32-bit 的 Linux 沒錯。

接著重複之前的重新編譯 kernel 的步驟，並註解掉 /arch/x86/include/asm/string_32.h 中的 memchr() ，以免使用這塊的 memchr() 而非 /lib/string.c 中的版本。

另外像之前的測試一樣加入 printk("str %u\n", length) 觀察是否真的執行到 memchr() 以及字串的長度。

重新編譯好後， Linux Mint 成功重新開機，輸入命令 dmesg 也觀察到 memch() 的輸出，和 64 bit 版本差不多。至此可以確定 32-bit Linux 下 memchr() 能正確運行。

git blame 查看 `memchr()`

透過git blame 可發現 mmemchr() 是 Linus Torvalds 撰寫的，而 memchr_inv 是 Akinobu Mita 所撰寫。

^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 869) #ifndef __HAVE_ARCH_MEMCHR
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 870) /**
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 871)  * memchr - Find a character in an area of memory.
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 872)  * @s: The memory area
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 873)  * @c: The byte to search for
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 874)  * @n: The size of the area.
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 875)  *
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 876)  * returns the address of the first occurrence of @c, or %NULL
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 877)  * if @c is not found
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 878)  */
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 879) void *memchr(const void *s, int c, size_t n)
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 880) {
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 881)   const unsigned char *p = s;
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 882)   while (n-- != 0) {
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 883)           if ((unsigned char)c == *p++) {
51a0f0f658b0e (Jesper Juhl     2005-10-30 15:02:11 -0800 884)                   return (void *)(p - 1);
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 885)           }
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 886)   }
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 887)   return NULL;
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 888) }
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 889) EXPORT_SYMBOL(memchr);
^1da177e4c3f4 (Linus Torvalds  2005-04-16 15:20:36 -0700 890) #endif

798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 912) void *memchr_inv(const void *start, int c, size_t bytes)
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 913) {
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 914)   u8 value = c;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 915)   u64 value64;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 916)   unsigned int words, prefix;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 917) 
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 918)   if (bytes <= 16)
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 919)           return check_bytes8(start, value, bytes);
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 920) 
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 921)   value64 = value;
72d931046030b (Linus Torvalds  2014-09-13 11:14:53 -0700 922) #if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
3368e8fbcda53 (Andy Shevchenko 2015-11-10 14:45:14 -0800 923)   value64 *= 0x0101010101010101ULL;
72d931046030b (Linus Torvalds  2014-09-13 11:14:53 -0700 924) #elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 925)   value64 *= 0x01010101;
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 926)   value64 |= value64 << 32;
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 927) #else
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 928)   value64 |= value64 << 8;
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 929)   value64 |= value64 << 16;
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 930)   value64 |= value64 << 32;
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 931) #endif
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 932) 
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 933)   prefix = (unsigned long)start % 8;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 934)   if (prefix) {
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 935)           u8 *r;
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 936) 
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 937)           prefix = 8 - prefix;
f43804bf5f9ae (Akinobu Mita    2012-03-23 15:02:14 -0700 938)           r = check_bytes8(start, value, prefix);
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 939)           if (r)
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 940)                   return r;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 941)           start += prefix;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 942)           bytes -= prefix;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 943)   }
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 944) 
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 945)   words = bytes / 8;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 946) 
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 947)   while (words) {
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 948)           if (*(u64 *)start != value64)
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 949)                   return check_bytes8(start, value, 8);
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 950)           start += 8;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 951)           words--;
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 952)   }
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 953) 
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 954)   return check_bytes8(start, value, bytes % 8);
798248206b59a (Akinobu Mita    2011-10-31 17:08:07 -0700 955) }

產生 patch

patch 1/2:

Subject: [PATCH 1/2] lib/string.c: Add a macro for memchr_inv()

We add a macro MEMCHR_MASK_GEN() so that both memchr_inv()
and memchr() can use it to generate a 8 bytes mask.

Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com>
Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>

patch 2/2:

Subject: [PATCH 2/2] lib/string.c: Optimize memchr()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original version of memchr() is implemented with the byte-wise
comparing technique, which does not fully use 64-bits or 32-bits
registers in CPU. We use word-wide comparing so that 8 characters
can be compared at the same time on CPU. This code is base on
David Laight's implementation.

We create two files to measure the performance. The first file
contains on average 10 characters ahead the target character.
The second file contains at least 1000 characters ahead the
target character. Our implementation of “memchr()” is slightly
better in the first test and nearly 4x faster than the orginal
implementation in the second test.

Signed-off-by: Yu-Jen Chang <arthurchang09@gmail.com>
Signed-off-by: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>

製作 patch 時，出現以下 CHECK

CHECK: Macro argument reuse 'mask' - possible side-effects?
#29: FILE: lib/string.c:888:
+#define MEMCHR_MASK_GEN(mask)                                                  \
+	do {                                                                   \
+		mask *= 0x01010101;                                            \
+		mask |= mask << 32;                                            \
+	} while (0)

CHECK: Macro argument 'mask' may be better as '(mask)' to avoid precedence issues
#29: FILE: lib/string.c:888:
+#define MEMCHR_MASK_GEN(mask)                                                  \
+	do {                                                                   \
+		mask *= 0x01010101;                                            \
+		mask |= mask << 32;                                            \
+	} while (0)

按照指示修該產生以下訊息

CHECK: Macro argument reuse 'mask' - possible side-effects?
#83: FILE: lib/string.c:888:
+#define MEMCHR_MASK_GEN(mask)                                                  \
+	do {                                                                   \
+		(mask) *= 0x01010101;                                          \
+		(mask) |= mask << 32;                                          \
+	} while (0)

參考 Macro Pitfalls

第一種情況為優先權問題，假設有一 macro 如下:

#define ceil_div(x, y) (x + y - 1) / y

用以下方法呼叫

 a = ceil_div (b & c, sizeof (int));

展開如下

a = (b & c + sizeof (int) - 1) / sizeof (int);

然而，考慮到優先程度，實際上會用以下方式運行

a = (b & (c + sizeof (int) - 1)) / sizeof (int);

而非

a = ((b & c) + sizeof (int) - 1)) / sizeof (int);

第二種情況是擔心 mask 可能是 fuction 的回傳值，在 macro 中重複使用會造成多次呼叫 function 。

然而，MEMCHR_MASK_GEN 這個 macro 只接受變數輸入，是將 mask 中的值展開成 8 bytes ，不可能牽涉到優先權以及 function 呼叫，因此我認為不須理會以上的 CHECK 。

提交 patch 對象

輸入下列命令後

./scripts/get_maintainer.pl 0001-lib-string.c-Add-a-macro-for-memchr_inv.patch

Andy Shevchenko <andy@kernel.org> (reviewer:GENERIC STRING LIBRARY)
linux-kernel@vger.kernel.org (open list)

因此寄給的對象可能是 memchr_inv() 的作者 Akinobu Mita akinobu.mita@gmail.com 以及 Andy Shevchenko andy@kernel.org

之後的修改

根據老師的提示，有些處理器架構，如：ARM ，並沒有對 unaligned data 進行處理時會有效率問題，因此要針對這一點做處理，也是開發者們在乎的問題。

#define LBLOCKSIZE (sizeof(long))
#if BITS_PER_LONG == 64

#define DETECT_CHAR(X)                                                         \
	(((X)-0x0101010101010101ULL) & ~(X)&0x8080808080808080ULL)

#define MEMCHR_MASK_GEN(mask)                                                  \
	do {                                                                   \
		(mask) |= (mask) << 8;                                         \
		(mask) |= (mask) << 16;                                        \
		(mask) |= (mask) << 32;                                        \
	} while (0)

#else

#define DETECT_CHAR(X) (((X)-0x01010101) & ~(X)&0x80808080)

#define MEMCHR_MASK_GEN(mask)                                                  \
	do {                                                                   \
		(mask) |= (mask) << 8;                                         \
		(mask) |= (mask) << 16;                                        \
	} while (0)

#endif

void *memchr(const void *p, int c, size_t length)
{
	unsigned long mask, val;
	const void *end = p + length;
	c &= 0xff;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	while ((long)p & (sizeof(long) - 1)) {
		if (p >= end)
			return NULL;
		if (*(unsigned char *)p == c)
			return (void *)p;
		p++;
	}
#endif
	if (p <= end - LBLOCKSIZE) {
		mask = c;
		MEMCHR_MASK_GEN(mask);

		for (; p <= end - LBLOCKSIZE; p += LBLOCKSIZE) {
			val = *(unsigned long *)p ^ mask;
			if (DETECT_CHAR(val))
				break;
		}
	}

	for (; p < end; p++)
		if (*(unsigned char *)p == c)
			return (void *)p;

	return NULL;
}
EXPORT_SYMBOL(memchr);

#if BITS_PER_LONG == 64

#define MEMCHR_MASK_GEN(mask)                                                  \
	do {                                                                   \
		(mask) |= (mask) << 8;                                         \
		(mask) |= (mask) << 16;                                        \
		(mask) |= (mask) << 32;                                        \
	} while (0)

#else

#define MEMCHR_MASK_GEN(mask)                                                  \
	do {                                                                   \
		(mask) |= (mask) << 8;                                         \
		(mask) |= (mask) << 16;                                        \
	} while (0)

#endif

void *memchr(const void *p, int c, size_t length)
{
    unsigned long mask;
    unsigned char *src = (unsigned char *) p;
    const void *end = p + length;
    const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
    size_t max = length;
    c &= 0xff;

    #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	/*
	 * If src is unaligned, don't cross a page boundary,
	 * since we don't know if the next page is mapped.
	 */
    if ((long)p & (sizeof(long) - 1)) {
		size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));

		if (limit < max)
			max = limit;
	}
  
    #else
        /* If src is unaligned, don't do word-at-a-time. */
        if (((long) p) & (sizeof(long) - 1))
            max = 0;
    #endif
    mask = c;
    MEMCHR_MASK_GEN(mask);
    for ( ; max >= sizeof(unsigned long); max -= sizeof(unsigned long), src += sizeof(unsigned long)) {
        unsigned long data, result;
        data = read_word_at_a_time(src);
        data = data ^ mask;
        if (has_zero(data, &result, &constants)) {
            break;
        }
    }

    for (; src < end; src++)
        if (*src == c)
            return (void *)src;

    return NULL;
}

Jim Huang

2022/04/13 06:33:08

###

另一個 memchr 的效能評比程式: https://gist.github.com/nico/2624336 (Edited)

2022/04/13 06:34:33

strchr 和 memchr 的效能比較: http://neilmitchell.blogspot.com/2017/12/benchmarking-strchr-vs-memchr.html (Edited)

2022/04/24 04:42:45

```c asm volatile( "testq $7,%0\n\t" "je end_align\n\t" "loop:\n\t" "length:\n\t" "decq %1\n\t" "testq %1,%1\n\t" "jne equal\n\t" "mov $0,%0\n\t" "jmp end\n\t" "equal:" "mov (%0),%2\n\t" "cmp %%al,%2\n\t" "je end\n\t" "incq %0\n\t" "align:\n\t" "testq $7,%0\n\t" "jne loop\n\t" "end_align:\n\t" : "=D" (src), "=&c" (length),"=r"(test) : "a" (c), "0" (src), "1" (length),"2"(test) : "memory","cc"); ```

依據 arch/x86/lib/memcpy_32.c 的註解風格，給予對應的程式碼說明 (Edited)

2022/04/29 13:54:28

不要用 time 來測量執行時間，你會把 ELF loader 和 VMA 的成本都算進去，請準備好 microbenchmarking (Edited)

2022/05/05 09:59:13

接下來測試開啟編譯器優化

可否移除下方程式碼的 unsigned long *asrc = (unsigned long *) src; 敘述？ (Edited)

2022/05/10 14:00:42

是

速度？ (Edited)

2022/05/18 08:03:46

修改成 `ja 1b\n\t` 即沒有問題。

TODO: 撰寫 correctness 驗證程式 (Edited)

2022/05/18 08:10:22

不過 UML 的測試反映重要訊息： memchr 函式在開機過程就會執行到，這讓驗證時間得以縮短 (Edited)

2022/05/18 12:21:13

修改後的結果如下:

參數 src_void 命名不夠簡潔 (Edited)

2022/06/08 06:16:08

hr()` 的效能優化並不明顯。 :

也許不要針對 arch/x86，而是放在架構無關的實作中 (Edited)

2022/06/08 06:23:58

這個測試不包含 drivers (Edited)

2022/06/27 13:23:16

可以寫成一個巨集

CREATE_MASK 不是好的命名，改為 MEMCHR_MASK_GEN (Edited)