# 2022q1 Homework5 (quiz8) answer contribute by < `arthurchang09` > # 測驗一 ```c= void *memchr_opt(const void *src_void, int c, size_t length) { const unsigned char *src = (const unsigned char *) src_void; unsigned char d = c; while (UNALIGNED(src)) { if (!length--) return NULL; if (*src == d) return (void *) src; src++; } if (!TOO_SMALL(length)) { /* If we get this far, we know that length is large and * src is word-aligned. */ /* The fast code reads the source one word at a time and only performs * the bytewise search on word-sized segments if they contain the search * character, which is detected by XORing the word-sized segment with a * word-sized block of the search character and then detecting for the * presence of NULL in the result. */ unsigned long *asrc = (unsigned long *) src; unsigned long mask = d << 8 | d; mask = mask << 16 | mask; for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; while (length >= LBLOCKSIZE) { /* XXXXX: Your implementation should appear here */ /* detect if there is a word that we want to find*/ if ( !DETECT_CHAR(*asrc, mask)) { /*if not, move to next 4byte*/ asrc += 1; length -= LBLOCKSIZE; } else { /*if we find the word, break the loop*/ break; } } /* If there are fewer than LBLOCKSIZE characters left, then we resort to * the bytewise loop. */ src = (unsigned char *) asrc; } while (length--) { if (*src == d) return (void *) src; src++; } return NULL; } ``` 第 5 行的 `while` 是確認字串是否對齊,若沒對齊則跳過這部分的輸入。 ```c while (UNALIGNED(src)) { if (!length--) return NULL; if (*src == d) return (void *) src; src++; } ``` 接著,若字串長度大於 `long` ,在 x86_64 處理器上資料寬度為 64 bits ,即 8 個字元,會進入第 12 行的 `if` 中。 下方的程式碼將要尋找的字元變成 64 bits 的 mask ,用來尋找 64 bits 中 是否有目標字元。假設要尋找 `.` 這個字元,其 16 進位為 `0x2e`,經過下方程式,`mask` 會變成 `0x2e2e2e2e2e2e2e2e` ```c unsigned long mask = d << 8 | d; mask = mask << 16 | mask; for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; ``` 在 28 行透過 `DETECT_CHAR` 判斷這 64 bits 是否存在目標字元,接著第 48 行再用 `while` 從這 64 bits 找到第一個目標字元並 `return`。 若都未找到,則回傳 `NULL`。 ## 比較 Linux 核心原本 (與平台無關) 的實作、 memchr_opt [Linux 核心原本 (與平台無關)](https://github.com/torvalds/linux/blob/master/lib/string.c#L892) 之程式碼如下 ```c void *memchr(const void *s, int c, size_t n) { const unsigned char *p = s; while (n-- != 0) { if ((unsigned char)c == *p++) { return (void *)(p - 1); } } return NULL; } ``` 其原理不複雜,使用 `while` 迴圈一個個走訪每一個字元,直到找到第一個目標字元。 在 [x86](https://github.com/torvalds/linux/blob/master/arch/x86/lib/string_32.c#L181) 平台有以下 inline assembly ```c=103 void *memchr(const void *cs, int c, size_t count) { int d0; void *res; if (!count) return NULL; asm volatile("repne\n\t" "scasb\n\t" "je 1f\n\t" "movl $1,%0\n" "1:\tdecl %0" : "=D" (res), "=&c" (d0) : "a" (c), "0" (cs), "1" (count) : "memory"); return res; } ``` 然而編譯時出現錯誤 ``` test.c: Assembler messages: test.c:112: Error: incorrect register `%rdi' used with `l' suffix test.c:113: Error: incorrect register `%rdi' used with `l' suffix ``` 根據 [AT&T Syntax versus Intel Syntax](http://web.mit.edu/rhel-doc/3/rhel-as-en-3/i386-syntax.html) , `movl` 和 `decl` 之 `l` 為 `long` (32 bits),依據 LP64,`long` 的資料寬度為 64 bits ,因此將上方程式碼稍作修改,將 `movl` `decl` 改成 `mov` `dec` 或 `movq` `decq`, `q` 為 quadruple word (64-bit) ,如下: ```c void *memchr_x86(const void *cs, int c, size_t count) { int d0; void *res; if (!count) return NULL; asm volatile("repne\n\t" "scasb\n\t" "je 1f\n\t" "mov $1,%0\n" "1:\tdec %0" : "=D" (res), "=&c" (d0) : "a" (c), "0" (cs), "1" (count) : "memory"); return res; } ``` 接著,我測試三者尋找目標字元所花費的時間。我宣告一個長度為 1000 的字元陣列,每個元素初始化為字元 `0` ,目標字元 `4` 會逐一放在第 0 個字元、第 1 個字元直到第 998 個字元,第 999 字元為 `\0`。採用 `struct timespec` 測量找到目標字元的時間。總共測量 120 次時間並去掉最小和最大各 10 個數值。 :::warning 將測試程式碼移到 GitHub 或 gist 上,讓開發紀錄更簡潔。 :notes: jserv ::: 以下為測試程式碼: :::spoiler Full Code ```c #include <stdio.h> #include <stdlib.h> #include <inttypes.h> #include <stdint.h> #include <stdbool.h> #include <string.h> #include <stddef.h> #include <limits.h> #include <time.h> #include <unistd.h> /* Nonzero if either X or Y is not aligned on a "long" boundary */ #define UNALIGNED(X) ((long) X & (sizeof(long) - 1)) /* How many bytes are loaded each iteration of the word copy loop */ #define LBLOCKSIZE (sizeof(long)) /* Threshhold for punting to the bytewise iterator */ #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE) #if LONG_MAX == 2147483647L #define DETECT_NULL(X) (((X) -0x01010101) & ~(X) & 0x80808080) #else #if LONG_MAX == 9223372036854775807L /* Nonzero if X (a long int) contains a NULL byte. */ #define DETECT_NULL(X) (((X) -0x0101010101010101) & ~(X) & 0x8080808080808080) #else #error long int is not a 32bit or 64bit type. #endif #endif /* @return nonzero if (long)X contains the byte used to fill MASK. */ #define DETECT_CHAR(X, MASK) (DETECT_NULL(X ^ MASK)) #define LENGTH 120 #define CUT 10 #define STRLEN 1000 void *memchr_org(const void *s, int c, size_t n) { const unsigned char *p = s; while (n-- != 0) { if ((unsigned char)c == *p++) { return (void *)(p - 1); } } return NULL; } void *memchr_opt(const void *src_void, int c, size_t length) { const unsigned char *src = (const unsigned char *) src_void; unsigned char d = c; while (UNALIGNED(src)) { if (!length--) return NULL; if (*src == d) return (void *) src; src++; } if (!TOO_SMALL(length)) { /* If we get this far, we know that length is large and * src is word-aligned. */ /* The fast code reads the source one word at a time and only performs * the bytewise search on word-sized segments if they contain the search * character, which is detected by XORing the word-sized segment with a * word-sized block of the search character and then detecting for the * presence of NULL in the result. */ unsigned long *asrc = (unsigned long *) src; unsigned long mask = d << 8 | d; mask = mask << 16 | mask; for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; int index = 0; while (length >= LBLOCKSIZE) { /* XXXXX: Your implementation should appear here */ if ( !DETECT_CHAR(*asrc, mask)) { asrc += 1; length -= LBLOCKSIZE; } else break; } /* If there are fewer than LBLOCKSIZE characters left, then we resort to * the bytewise loop. */ src = (unsigned char *) asrc; } while (length--) { if (*src == d) return (void *) src; src++; } return NULL; } void *memchr_x86(const void *cs, int c, size_t count) { int d0; void *res; if (!count) return NULL; asm volatile("repne\n\t" "scasb\n\t" "je 1f\n\t" "mov $1,%0\n" "1:\tdec %0" : "=D" (res), "=&c" (d0) : "a" (c), "0" (cs), "1" (count) : "memory"); return res; } int cmp ( const void *a , const void *b ) { return *(int *)a > *(int *)b; } int main() { char str[STRLEN]; memset(str,'0', sizeof(char)*STRLEN); str[STRLEN - 1] = '\0'; char ch = '4'; int data[3][LENGTH]; memset(data,0,sizeof(data)); int sum[3]; memset(sum,0,sizeof(sum)); struct timespec t1,t2,t3,t4,t5,t6; for (int k = 0; k < STRLEN - 1; ++k){ memset(str,'0', sizeof(char)*STRLEN); str[STRLEN - 1] = '\0'; str[k] = '4'; memset(data,0,sizeof(data)); memset(sum,0,sizeof(sum)); for (int i = 0; i < LENGTH; ++i) { clock_gettime(CLOCK_REALTIME, &t1); char *ret = memchr_opt(str, ch, strlen(str)); clock_gettime(CLOCK_REALTIME, &t2); data[0][i] = t2.tv_nsec - t1.tv_nsec; clock_gettime(CLOCK_REALTIME, &t3); ret = memchr_x86(str, ch, strlen(str)); clock_gettime(CLOCK_REALTIME, &t4); data[1][i] = t4.tv_nsec - t3.tv_nsec; clock_gettime(CLOCK_REALTIME, &t5); memchr_org(str, ch, strlen(str)); clock_gettime(CLOCK_REALTIME, &t6); data[2][i] = t6.tv_nsec - t5.tv_nsec; usleep(1500); } printf("%d ",k); for (int i = 0; i < 3; ++i) { qsort(data[i],LENGTH,sizeof(int),cmp); for (int j = CUT; j < LENGTH - CUT; ++j) { sum[i] += data[i][j]; } printf("%d ", sum[i] / (LENGTH - (CUT << 1))); } printf("\n"); } return 0; } ``` ::: 下圖為在虛擬機上的結果: ![](https://i.imgur.com/4gJpnZU.png) :::warning TODO: 比對 glibc 的 `memchr` 實作: https://sourceware.org/git/?p=glibc.git;a=tree;f=sysdeps/x86_64 :notes: jserv ::: ### 引入 glibc 之 `memchr.S` glibc 之 [`memchr.S`](https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/memchr.S)如下: :::spoiler ```c /* Copyright (C) 2011-2018 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ #include <sysdep.h> #ifdef USE_AS_WMEMCHR # define MEMCHR wmemchr # define PCMPEQ pcmpeqd #else # define MEMCHR memchr # define PCMPEQ pcmpeqb #endif /* fast SSE2 version with using pmaxub and 64 byte loop */ .text ENTRY(MEMCHR) movd %esi, %xmm1 mov %edi, %ecx #ifdef USE_AS_WMEMCHR test %rdx, %rdx jz L(return_null) shl $2, %rdx #else punpcklbw %xmm1, %xmm1 test %rdx, %rdx jz L(return_null) punpcklbw %xmm1, %xmm1 #endif and $63, %ecx pshufd $0, %xmm1, %xmm1 cmp $48, %ecx ja L(crosscache) movdqu (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches_1) sub $16, %rdx jbe L(return_null) add $16, %rdi and $15, %ecx and $-16, %rdi add %rcx, %rdx sub $64, %rdx jbe L(exit_loop) jmp L(loop_prolog) .p2align 4 L(crosscache): and $15, %ecx and $-16, %rdi movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 /* Check if there is a match. */ pmovmskb %xmm0, %eax /* Remove the leading bytes. */ sar %cl, %eax test %eax, %eax je L(unaligned_no_match) /* Check which byte is a match. */ bsf %eax, %eax sub %rax, %rdx jbe L(return_null) add %rdi, %rax add %rcx, %rax ret .p2align 4 L(unaligned_no_match): /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void possible addition overflow. */ neg %rcx add $16, %rcx sub %rcx, %rdx jbe L(return_null) add $16, %rdi sub $64, %rdx jbe L(exit_loop) .p2align 4 L(loop_prolog): movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) movdqa 16(%rdi), %xmm2 PCMPEQ %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) movdqa 48(%rdi), %xmm4 PCMPEQ %xmm1, %xmm4 add $64, %rdi pmovmskb %xmm4, %eax test %eax, %eax jnz L(matches0) test $0x3f, %rdi jz L(align64_loop) sub $64, %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) movdqa 16(%rdi), %xmm2 PCMPEQ %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) movdqa 48(%rdi), %xmm3 PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax add $64, %rdi test %eax, %eax jnz L(matches0) mov %rdi, %rcx and $-64, %rdi and $63, %ecx add %rcx, %rdx .p2align 4 L(align64_loop): sub $64, %rdx jbe L(exit_loop) movdqa (%rdi), %xmm0 movdqa 16(%rdi), %xmm2 movdqa 32(%rdi), %xmm3 movdqa 48(%rdi), %xmm4 PCMPEQ %xmm1, %xmm0 PCMPEQ %xmm1, %xmm2 PCMPEQ %xmm1, %xmm3 PCMPEQ %xmm1, %xmm4 pmaxub %xmm0, %xmm3 pmaxub %xmm2, %xmm4 pmaxub %xmm3, %xmm4 pmovmskb %xmm4, %eax add $64, %rdi test %eax, %eax jz L(align64_loop) sub $64, %rdi pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 PCMPEQ %xmm1, %xmm3 PCMPEQ 48(%rdi), %xmm1 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32) pmovmskb %xmm1, %eax bsf %eax, %eax lea 48(%rdi, %rax), %rax ret .p2align 4 L(exit_loop): add $32, %edx jle L(exit_loop_32) movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches) movdqa 16(%rdi), %xmm2 PCMPEQ %xmm1, %xmm2 pmovmskb %xmm2, %eax test %eax, %eax jnz L(matches16) movdqa 32(%rdi), %xmm3 PCMPEQ %xmm1, %xmm3 pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32_1) sub $16, %edx jle L(return_null) PCMPEQ 48(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax jnz L(matches48_1) xor %eax, %eax ret .p2align 4 L(exit_loop_32): add $32, %edx movdqa (%rdi), %xmm0 PCMPEQ %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches_1) sub $16, %edx jbe L(return_null) PCMPEQ 16(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax jnz L(matches16_1) xor %eax, %eax ret .p2align 4 L(matches0): bsf %eax, %eax lea -16(%rax, %rdi), %rax ret .p2align 4 L(matches): bsf %eax, %eax add %rdi, %rax ret .p2align 4 L(matches16): bsf %eax, %eax lea 16(%rax, %rdi), %rax ret .p2align 4 L(matches32): bsf %eax, %eax lea 32(%rax, %rdi), %rax ret .p2align 4 L(matches_1): bsf %eax, %eax sub %rax, %rdx jbe L(return_null) add %rdi, %rax ret .p2align 4 L(matches16_1): bsf %eax, %eax sub %rax, %rdx jbe L(return_null) lea 16(%rdi, %rax), %rax ret .p2align 4 L(matches32_1): bsf %eax, %eax sub %rax, %rdx jbe L(return_null) lea 32(%rdi, %rax), %rax ret .p2align 4 L(matches48_1): bsf %eax, %eax sub %rax, %rdx jbe L(return_null) lea 48(%rdi, %rax), %rax ret .p2align 4 L(return_null): xor %eax, %eax ret END(MEMCHR) #ifndef USE_AS_WMEMCHR strong_alias (memchr, __memchr) libc_hidden_builtin_def(memchr) #endif ``` ::: 嘗試編譯時發生以下錯誤 ``` memchr.S:16:10: fatal error: sysdep.h: No such file or directory 16 | #include <sysdep.h> | ^~~~~~~~~~ ``` 先將這一行暫時註解掉,執行以下指令 ``` gcc -c memchr.S ``` 得到以下輸出 :::spoiler ```= memchr.S: Assembler messages: memchr.S:30: Error: invalid character '(' in mnemonic memchr.S:41: Error: junk `(return_null)' after expression memchr.S:49: Error: junk `(crosscache)' after expression memchr.S:56: Error: junk `(matches_1)' after expression memchr.S:58: Error: junk `(return_null)' after expression memchr.S:64: Error: junk `(exit_loop)' after expression memchr.S:65: Error: junk `(loop_prolog)' after expression memchr.S:68: Error: invalid character '(' in mnemonic memchr.S:79: Error: junk `(unaligned_no_match)' after expression memchr.S:84: Error: junk `(return_null)' after expression memchr.S:90: Error: invalid character '(' in mnemonic memchr.S:97: Error: junk `(return_null)' after expression memchr.S:100: Error: junk `(exit_loop)' after expression memchr.S:103: Error: invalid character '(' in mnemonic memchr.S:108: Error: junk `(matches)' after expression memchr.S:114: Error: junk `(matches16)' after expression memchr.S:120: Error: junk `(matches32)' after expression memchr.S:127: Error: junk `(matches0)' after expression memchr.S:130: Error: junk `(align64_loop)' after expression memchr.S:133: Error: junk `(exit_loop)' after expression memchr.S:139: Error: junk `(matches)' after expression memchr.S:145: Error: junk `(matches16)' after expression memchr.S:151: Error: junk `(matches32)' after expression memchr.S:159: Error: junk `(matches0)' after expression memchr.S:167: Error: invalid character '(' in mnemonic memchr.S:169: Error: junk `(exit_loop)' after expression memchr.S:188: Error: junk `(align64_loop)' after expression memchr.S:194: Error: junk `(matches)' after expression memchr.S:198: Error: junk `(matches16)' after expression memchr.S:206: Error: junk `(matches32)' after expression memchr.S:214: Error: invalid character '(' in mnemonic memchr.S:216: Error: junk `(exit_loop_32)' after expression memchr.S:222: Error: junk `(matches)' after expression memchr.S:228: Error: junk `(matches16)' after expression memchr.S:234: Error: junk `(matches32_1)' after expression memchr.S:236: Error: junk `(return_null)' after expression memchr.S:241: Error: junk `(matches48_1)' after expression memchr.S:246: Error: invalid character '(' in mnemonic memchr.S:252: Error: junk `(matches_1)' after expression memchr.S:254: Error: junk `(return_null)' after expression memchr.S:259: Error: junk `(matches16_1)' after expression memchr.S:264: Error: invalid character '(' in mnemonic memchr.S:270: Error: invalid character '(' in mnemonic memchr.S:276: Error: invalid character '(' in mnemonic memchr.S:282: Error: invalid character '(' in mnemonic memchr.S:288: Error: invalid character '(' in mnemonic memchr.S:291: Error: junk `(return_null)' after expression memchr.S:296: Error: invalid character '(' in mnemonic memchr.S:299: Error: junk `(return_null)' after expression memchr.S:304: Error: invalid character '(' in mnemonic memchr.S:307: Error: junk `(return_null)' after expression memchr.S:312: Error: invalid character '(' in mnemonic memchr.S:315: Error: junk `(return_null)' after expression memchr.S:320: Error: invalid character '(' in mnemonic memchr.S:323: Error: invalid character '(' in mnemonic memchr.S:326: Error: no such instruction: `strong_alias (memchr,__memchr)' memchr.S:327: Error: no such instruction: `libc_hidden_builtin_def(memchr)' ``` ::: 上方的 error 大多都是格式上的錯誤,參考 [baby-steps-in-assembly](https://github.com/majek/baby-steps-in-assembly) 將組合語言稍加修改,引入 `macro.h` ,並將此 `memchr` define 為 `memchr1` ,方便在主程式 `test.c` 呼叫。部分擷取如下: ``` //#include "sysdep.h" #include "macros.h" #ifdef USE_AS_WMEMCHR # define MEMCHR wmemchr # define PCMPEQ pcmpeqd #else # define MEMCHR1 memchr1 # define PCMPEQ pcmpeqb #endif /* fast SSE2 version with using pmaxub and 64 byte loop */ ENTRY(MEMCHR1) movd %esi, %xmm1 mov %edi, %ecx ``` 在主程式也要宣告相對應之 function ```c void *memchr1(const void *cs, int c, size_t count); ``` 將所有 label 的格式修改,如: `L(return_null)` 改成 `L_return_null`。最後幾行的 `#ifndef` 註解掉使編譯能通過。 執行以下命令編譯 `test.c` 並與 `memchr.S` link 起來 ``` gcc -m64 -g -ggdb -Wextra test.c memchr.S -o a.out ``` 執行測驗一之測試程式的結果如下 ``` $ ./a.out String after |.| is - |.csie.ncku.edu.tw| ``` ### 效能比較 我發現先前分析之程式碼有錯誤,`memchr_opt` 前面使用之 `usleep()` 太長,導致測量的時間較長,低估了效率。因此我將每一個 `memchr` 測試統一延遲 1 個 $\mu s$ ,這樣的結果會比較公平,受到的影響皆相同。 我也參考 [fibdrv 的 效能分析提示](https://hackmd.io/@sysprog/linux2022-fibdrv#-Linux-效能分析的提示) 將測試程式指定在一個 CPU 上,排除干擾效能分析的因素,在我筆電的雙系統上測試,得到下圖: ![](https://i.imgur.com/8S0LYKN.png) 由此圖很明顯可看出效能由高到低為 `glibc` > `memchr-opt` > `memchr-x86` > `original` 。 比較出人意外的是 `memchr_opt` 比 `memchr-x86` 快,回顧 `memchr-x86` 程式碼, ```c void *memchr_x86(const void *cs, int c, size_t count) { int d0; void *res; if (!count) return NULL; asm volatile("repne\n\t" "scasb\n\t" "je 1f\n\t" "mov $1,%0\n" "1:\tdec %0" : "=D" (res), "=&c" (d0) : "a" (c), "0" (cs), "1" (count) : "memory"); return res; } ``` 根據 [GCC-Inline-Assembly-HOWTO](http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html) 以及 [Extended Asm - Assembler Instructions with C Expression Operands](https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html) , Inline Assembly 之架構如下: ```c asm ( assembler template : output operands /* optional */ : input operands /* optional */ : list of clobbered registers /* optional */ ); ``` output operands 和 input operands 放置 register 對應的變數,會用 `%數字` 從 output operands 編號,以 `memchr_x86` 為例, `%0` 即是 `"=D" (res)` , `%1` 為 `"=&c" (d0)` ,若 output 和 input 為相同 register ,則會出現 `"0" (cs)` 的型態,表示 `cs` 和編號 0 , 即`"=D" (res)` 同一個 register 。 在這段組合語言中,變數對應的 register 如下: | Parameter | Register | |:---------:|:--------:| | cs | EDI | | c | AL(EAX) | | count | ECX | | d0 | ECX | | res | EDI | 根據 [Intel Architecture Software Developer’s Manual Volume 2: Instruction Set Reference](https://www.cs.cmu.edu/~410/doc/intel-isr.pdf) ,在第 365 頁,第一個指令 `repne` (repeat while not equal) 是和 string 操作相關,其執行條件如下: | Repeat Prefix | Termination Condition 1 | Termination Condition 2 | |:-------------:|:-----------------------:|:-----------------------:| | REPNE/REPNZ | ECX=0 | ZF=1 | * ZF 為 Zero Flag * ECX 為 Counter for string and loop operations , 參見[Intel® 64 and IA-32 Architectures Developer's Manual: Vol. 1](https://www.intel.com.tw/content/www/tw/zh/architecture-and-technology/64-ia-32-architectures-software-developer-vol-1-manual.html) ,每跑一次 `repne`, ECX 減一。 換而言之, `repne` 的作用類似 `while` 迴圈,大略可對應 [Linux 核心原本 (與平台無關)](https://github.com/torvalds/linux/blob/master/lib/string.c#L892) 中的 `while (n-- != 0)`。 `scasb` 是用來比較從記憶體搬來的 1 個 `byte` 和暫存器 `AL` 的值是否相等。根據結果設定對應之EFLAGS register 中的 status flag ,應該是 Zero Flag 。比較完後,會依據 EFLAGS register 的 DF Flags 將 EDI register 加一或減一。這裡沒有特別設定,為加一。 當符合 `repne` 結束條件,即 `ZF = 1` 或走完整個字串後,je 會判斷 ZF 的值。其功能為 `Jump short if equal (ZF=1)` ,因此有找到 match 之字元,會跳至 label 1 ,用 `dec` 將 `res` 減一,因為找到 match 字串後 `EDI register` 會自動加一。 反之,`ZF = 0` ,則會將 1 放入 `res` ,後面的 `dec` 將 `res` 減一得 0 ,即 `NULL`。 由以上說明可見 `memchr_x86` 其實就是 [Linux 核心原本 (與平台無關)](https://github.com/torvalds/linux/blob/master/lib/string.c#L892) 換成組合語言,我將這兩個 `function` 的 assmebly 列出,如下: `gcc -S test.c` :::spoiler memchr_x86 ```asm memchr_x86: .LFB0: .cfi_startproc endbr64 pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq %rdi, -24(%rbp) movl %esi, -28(%rbp) movq %rdx, -40(%rbp) cmpq $0, -40(%rbp) jne .L2 movl $0, %eax jmp .L3 .L2: movl -28(%rbp), %eax movq -24(%rbp), %rdx movq -40(%rbp), %rcx movq %rdx, %rdi #APP # 9 "test.c" 1 repne scasb je 1f mov $1,%rdi 1: dec %rdi # 0 "" 2 #NO_APP movl %ecx, %eax movq %rdi, %rdx movq %rdx, -8(%rbp) movl %eax, -12(%rbp) movq -8(%rbp), %rax .L3: popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE0: .size memchr_x86, .-memchr_x86 ``` ::: :::spoiler memchr_linux ```asm memchr_linux: .LFB1: .cfi_startproc endbr64 pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq %rdi, -24(%rbp) movl %esi, -28(%rbp) movq %rdx, -40(%rbp) movq -24(%rbp), %rax movq %rax, -8(%rbp) jmp .L5 .L7: movl -28(%rbp), %eax movl %eax, %ecx movq -8(%rbp), %rax leaq 1(%rax), %rdx movq %rdx, -8(%rbp) movzbl (%rax), %eax cmpb %al, %cl jne .L5 movq -8(%rbp), %rax subq $1, %rax jmp .L6 .L5: movq -40(%rbp), %rax leaq -1(%rax), %rdx movq %rdx, -40(%rbp) testq %rax, %rax jne .L7 movl $0, %eax .L6: popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE1: .size memchr_linux, .-memchr_linux .section .rodata ``` ::: 可看出 `memchr_x86` 在比較字串方面比 `memchr_linux` 少不少,推測這應該是效能較佳的原因。 #### memchr_opt 和 glibc memchr.c 下圖為 `memchr_opt` 和 `glibc-memchr.c` 的執行時間比較,測量方式如 [memchr 與 strchr 比較](https://hackmd.io/@arthur-chang/linux2022-quiz8#memchr-和-strchr-比較) 所述,有趣的是兩者執行的時間幾乎相同。 ![](https://i.imgur.com/UJxRLTq.png) 接著觀察 [`glibc-memchr.c`](https://github.com/lattera/glibc/blob/master/string/memchr.c) 的程式碼,如下: :::spoiler ```c= void *MEMCHR (void const *s, int c_in, size_t n) { /* On 32-bit hardware, choosing longword to be a 32-bit unsigned long instead of a 64-bit uintmax_t tends to give better performance. On 64-bit hardware, unsigned long is generally 64 bits already. Change this typedef to experiment with performance. */ typedef unsigned long int longword; const unsigned char *char_ptr; const longword *longword_ptr; longword repeated_one; longword repeated_c; unsigned char c; c = (unsigned char) c_in; /* Handle the first few bytes by reading one byte at a time. Do this until CHAR_PTR is aligned on a longword boundary. */ for (char_ptr = (const unsigned char *) s; n > 0 && (size_t) char_ptr % sizeof (longword) != 0; --n, ++char_ptr) if (*char_ptr == c) return (void *) char_ptr; longword_ptr = (const longword *) char_ptr; /* All these elucidatory comments refer to 4-byte longwords, but the theory applies equally well to any size longwords. */ /* Compute auxiliary longword values: repeated_one is a value which has a 1 in every byte. repeated_c has c in every byte. */ repeated_one = 0x01010101; repeated_c = c | (c << 8); repeated_c |= repeated_c << 16; if (0xffffffffU < (longword) -1) { repeated_one |= repeated_one << 31 << 1; repeated_c |= repeated_c << 31 << 1; if (8 < sizeof (longword)) { size_t i; for (i = 64; i < sizeof (longword) * 8; i *= 2) { repeated_one |= repeated_one << i; repeated_c |= repeated_c << i; } } } /* Instead of the traditional loop which tests each byte, we will test a longword at a time. The tricky part is testing if *any of the four* bytes in the longword in question are equal to c. We first use an xor with repeated_c. This reduces the task to testing whether *any of the four* bytes in longword1 is zero. We compute tmp = ((longword1 - repeated_one) & ~longword1) & (repeated_one << 7). That is, we perform the following operations: 1. Subtract repeated_one. 2. & ~longword1. 3. & a mask consisting of 0x80 in every byte. Consider what happens in each byte: - If a byte of longword1 is zero, step 1 and 2 transform it into 0xff, and step 3 transforms it into 0x80. A carry can also be propagated to more significant bytes. - If a byte of longword1 is nonzero, let its lowest 1 bit be at position k (0 <= k <= 7); so the lowest k bits are 0. After step 1, the byte ends in a single bit of value 0 and k bits of value 1. After step 2, the result is just k bits of value 1: 2^k - 1. After step 3, the result is 0. And no carry is produced. So, if longword1 has only non-zero bytes, tmp is zero. Whereas if longword1 has a zero byte, call j the position of the least significant zero byte. Then the result has a zero at positions 0, ..., j-1 and a 0x80 at position j. We cannot predict the result at the more significant bytes (positions j+1..3), but it does not matter since we already have a non-zero bit at position 8*j+7. So, the test whether any byte in longword1 is zero is equivalent to testing whether tmp is nonzero. */ while (n >= sizeof (longword)) { longword longword1 = *longword_ptr ^ repeated_c; if ((((longword1 - repeated_one) & ~longword1) & (repeated_one << 7)) != 0) break; longword_ptr++; n -= sizeof (longword); } char_ptr = (const unsigned char *) longword_ptr; /* At this point, we know that either n < sizeof (longword), or one of the sizeof (longword) bytes starting at char_ptr is == c. On little-endian machines, we could determine the first such byte without any further memory accesses, just by looking at the tmp result from the last loop iteration. But this does not work on big-endian machines. Choose code that works in both cases. */ for (; n > 0; --n, ++char_ptr) { if (*char_ptr == c) return (void *) char_ptr; } return NULL; } ``` ::: 其程式碼之想法和 `memchr_opt` 一樣,都是採用 64 bits 之 mask 來尋找目標字元。程式碼一開始同樣是對字串做 alignment,接著用相近的方式做出 64 bits 之 mask ,比較 64 bits 之方法同樣採用 xor 等等。 `memchr_opt` 使採用 `DETECT_CHAR` 來包裝這樣的操作。總而言之,這兩者是用不同的方式實作相同的想法,因此執行時間上也幾乎一樣。 ### memchr 和 strchr 比較 `strchr` 的 Linux 組合語言版本尚有問題須解決,如下面所示,先列出其他之比較。 ``` Error: unsupported instruction `mov' ``` 效能分析之模式參考 https://gist.github.com/nico/2624336 ,測量方式依舊使用 `struct timespec` ,測量執行兩百次之時間再除以兩百得到執行一次之時間。 ![](https://i.imgur.com/IJ46AnC.png) ![](https://i.imgur.com/86B3zcr.png) ![](https://i.imgur.com/rAMH3Wy.png) 下圖是 `glibc-memchr.S` 和直接呼叫 `<string.h>` 中 `memchr` 的比較,兩者的執行時間皆是所有 `memchr` 版本中前二低,推測 `<string.h>` 中 `memchr` 應該是 `glibc-memchr.S` 的實作。 ![](https://i.imgur.com/PI4NQB2.png) 接著比較 `memchrr` 與 `strchr` 之差別, ![](https://i.imgur.com/rOWK22H.png) 由上圖可見,`memchr-linux` 比 `strchr-linux` 還要快,到 `glibc-memchr.c` 和 `glibc-strchr.c` 時依舊有領先,但比到 `default` 的部分,如下圖,`strchr` 就比 `memchr` 還快。 ![](https://i.imgur.com/GiQsydi.png) 組合語言的部分也是如此 ![](https://i.imgur.com/4UXH6A0.png)