Try   HackMD

2022q1 Homework5 (quiz8) answer

contribute by < arthurchang09 >

測驗一

void *memchr_opt(const void *src_void, int c, size_t length) { const unsigned char *src = (const unsigned char *) src_void; unsigned char d = c; while (UNALIGNED(src)) { if (!length--) return NULL; if (*src == d) return (void *) src; src++; } if (!TOO_SMALL(length)) { /* If we get this far, we know that length is large and * src is word-aligned. */ /* The fast code reads the source one word at a time and only performs * the bytewise search on word-sized segments if they contain the search * character, which is detected by XORing the word-sized segment with a * word-sized block of the search character and then detecting for the * presence of NULL in the result. */ unsigned long *asrc = (unsigned long *) src; unsigned long mask = d << 8 | d; mask = mask << 16 | mask; for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; while (length >= LBLOCKSIZE) { /* XXXXX: Your implementation should appear here */ /* detect if there is a word that we want to find*/ if ( !DETECT_CHAR(*asrc, mask)) { /*if not, move to next 4byte*/ asrc += 1; length -= LBLOCKSIZE; } else { /*if we find the word, break the loop*/ break; } } /* If there are fewer than LBLOCKSIZE characters left, then we resort to * the bytewise loop. */ src = (unsigned char *) asrc; } while (length--) { if (*src == d) return (void *) src; src++; } return NULL; }

第 5 行的 while 是確認字串是否對齊,若沒對齊則跳過這部分的輸入。

while (UNALIGNED(src)) {
        if (!length--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }

接著,若字串長度大於 long ,在 x86_64 處理器上資料寬度為 64 bits ,即 8 個字元,會進入第 12 行的 if 中。
下方的程式碼將要尋找的字元變成 64 bits 的 mask ,用來尋找 64 bits 中 是否有目標字元。假設要尋找 . 這個字元,其 16 進位為 0x2e,經過下方程式,mask 會變成 0x2e2e2e2e2e2e2e2e

        unsigned long mask = d << 8 | d;
        mask = mask << 16 | mask;
        for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
            mask = (mask << i) | mask;

在 28 行透過 DETECT_CHAR 判斷這 64 bits 是否存在目標字元,接著第 48 行再用 while 從這 64 bits 找到第一個目標字元並 return。 若都未找到,則回傳 NULL

比較 Linux 核心原本 (與平台無關) 的實作、 memchr_opt

Linux 核心原本 (與平台無關) 之程式碼如下

void *memchr(const void *s, int c, size_t n)
{
    const unsigned char *p = s;
    while (n-- != 0) {
        if ((unsigned char)c == *p++) {
            return (void *)(p - 1);
        }
    }
    return NULL;
}

其原理不複雜,使用 while 迴圈一個個走訪每一個字元,直到找到第一個目標字元。

x86 平台有以下 inline assembly

void *memchr(const void *cs, int c, size_t count) { int d0; void *res; if (!count) return NULL; asm volatile("repne\n\t" "scasb\n\t" "je 1f\n\t" "movl $1,%0\n" "1:\tdecl %0" : "=D" (res), "=&c" (d0) : "a" (c), "0" (cs), "1" (count) : "memory"); return res; }

然而編譯時出現錯誤

test.c: Assembler messages:
test.c:112: Error: incorrect register `%rdi' used with `l' suffix
test.c:113: Error: incorrect register `%rdi' used with `l' suffix

根據 AT&T Syntax versus Intel Syntaxmovldeclllong (32 bits),依據 LP64,long 的資料寬度為 64 bits ,因此將上方程式碼稍作修改,將 movl decl 改成 mov decmovq decqq 為 quadruple word (64-bit) ,如下:

void *memchr_x86(const void *cs, int c, size_t count)
{
	int d0;
	void *res;
	if (!count)
		return NULL;
	asm volatile("repne\n\t"
		"scasb\n\t"
		"je 1f\n\t"
		"mov $1,%0\n"
		"1:\tdec %0"
		: "=D" (res), "=&c" (d0)
		: "a" (c), "0" (cs), "1" (count)
		: "memory");
	return res;
}

接著,我測試三者尋找目標字元所花費的時間。我宣告一個長度為 1000 的字元陣列,每個元素初始化為字元 0 ,目標字元 4 會逐一放在第 0 個字元、第 1 個字元直到第 998 個字元,第 999 字元為 \0。採用 struct timespec 測量找到目標字元的時間。總共測量 120 次時間並去掉最小和最大各 10 個數值。

將測試程式碼移到 GitHub 或 gist 上,讓開發紀錄更簡潔。

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv

以下為測試程式碼:

Full Code
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <stddef.h>
#include <limits.h>
#include <time.h>
#include <unistd.h>

/* Nonzero if either X or Y is not aligned on a "long" boundary */
#define UNALIGNED(X) ((long) X & (sizeof(long) - 1))

/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))

/* Threshhold for punting to the bytewise iterator */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)

#if LONG_MAX == 2147483647L
#define DETECT_NULL(X) (((X) -0x01010101) & ~(X) & 0x80808080)
#else
#if LONG_MAX == 9223372036854775807L
/* Nonzero if X (a long int) contains a NULL byte. */
#define DETECT_NULL(X) (((X) -0x0101010101010101) & ~(X) & 0x8080808080808080)
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif

/* @return nonzero if (long)X contains the byte used to fill MASK. */
#define DETECT_CHAR(X, MASK) (DETECT_NULL(X ^ MASK))

#define LENGTH 120
#define CUT 10
#define STRLEN 1000

void *memchr_org(const void *s, int c, size_t n)
{
    const unsigned char *p = s;
    while (n-- != 0) {
        if ((unsigned char)c == *p++) {
            return (void *)(p - 1);
        }
    }
    return NULL;
}


void *memchr_opt(const void *src_void, int c, size_t length)
{
    const unsigned char *src = (const unsigned char *) src_void;
    unsigned char d = c;
    while (UNALIGNED(src)) {
        if (!length--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }
    if (!TOO_SMALL(length)) {
        /* If we get this far, we know that length is large and
         * src is word-aligned.
         */

        /* The fast code reads the source one word at a time and only performs
         * the bytewise search on word-sized segments if they contain the search
         * character, which is detected by XORing the word-sized segment with a
         * word-sized block of the search character and then detecting for the
         * presence of NULL in the result.
         */
        unsigned long *asrc = (unsigned long *) src;
        unsigned long mask = d << 8 | d;
        mask = mask << 16 | mask;
        for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
            mask = (mask << i) | mask;
        int index = 0;
        while (length >= LBLOCKSIZE) {
            /* XXXXX: Your implementation should appear here */
            if ( !DETECT_CHAR(*asrc, mask)) {
                asrc += 1;
                length -= LBLOCKSIZE;
            }
            else
                break;
        }

        /* If there are fewer than LBLOCKSIZE characters left, then we resort to
         * the bytewise loop.
         */
        src = (unsigned char *) asrc;
    }

    while (length--) {
        if (*src == d)
            return (void *) src;
        src++;
    }
    return NULL;
}

void *memchr_x86(const void *cs, int c, size_t count)
{
	int d0;
	void *res;
	if (!count)
		return NULL;
	asm volatile("repne\n\t"
		"scasb\n\t"
		"je 1f\n\t"
		"mov $1,%0\n"
		"1:\tdec %0"
		: "=D" (res), "=&c" (d0)
		: "a" (c), "0" (cs), "1" (count)
		: "memory");
	return res;
}

int cmp ( const void *a , const void *b ) 
{ 
    return *(int *)a > *(int *)b;
}

int main()
{   
    char str[STRLEN];
    memset(str,'0', sizeof(char)*STRLEN);
    str[STRLEN - 1] = '\0';
    char ch = '4';


    int data[3][LENGTH];
    memset(data,0,sizeof(data));
    int sum[3];
    memset(sum,0,sizeof(sum));

    struct timespec t1,t2,t3,t4,t5,t6;

    for (int k = 0; k < STRLEN - 1; ++k){
        memset(str,'0', sizeof(char)*STRLEN);
        str[STRLEN - 1] = '\0';
        str[k] = '4';
        memset(data,0,sizeof(data));
        memset(sum,0,sizeof(sum));
        for (int i = 0; i < LENGTH; ++i) {
            clock_gettime(CLOCK_REALTIME, &t1);
            char *ret = memchr_opt(str, ch, strlen(str));
            clock_gettime(CLOCK_REALTIME, &t2);
            data[0][i] = t2.tv_nsec - t1.tv_nsec;

            clock_gettime(CLOCK_REALTIME, &t3);
            ret = memchr_x86(str, ch, strlen(str));
            clock_gettime(CLOCK_REALTIME, &t4);
            data[1][i] = t4.tv_nsec - t3.tv_nsec;

            clock_gettime(CLOCK_REALTIME, &t5);
            memchr_org(str, ch, strlen(str));
            clock_gettime(CLOCK_REALTIME, &t6);
            data[2][i] = t6.tv_nsec - t5.tv_nsec;

            usleep(1500);
        }
        printf("%d ",k);
        for (int i = 0; i < 3; ++i) {
            qsort(data[i],LENGTH,sizeof(int),cmp);
            for (int j = CUT; j < LENGTH - CUT; ++j) {
                sum[i] += data[i][j];
            }
            printf("%d ", sum[i] / (LENGTH - (CUT << 1)));
        }
        printf("\n");

    }
    return 0;
}

下圖為在虛擬機上的結果:

TODO: 比對 glibc 的 memchr 實作: https://sourceware.org/git/?p=glibc.git;a=tree;f=sysdeps/x86_64

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv

引入 glibc 之 memchr.S

glibc 之 memchr.S如下:

/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef USE_AS_WMEMCHR
# define MEMCHR		wmemchr
# define PCMPEQ		pcmpeqd
#else
# define MEMCHR		memchr
# define PCMPEQ		pcmpeqb
#endif

/* fast SSE2 version with using pmaxub and 64 byte loop */

	.text
ENTRY(MEMCHR)
	movd	%esi, %xmm1
	mov	%edi, %ecx

#ifdef USE_AS_WMEMCHR
	test	%rdx, %rdx
	jz	L(return_null)
	shl	$2, %rdx
#else
	punpcklbw %xmm1, %xmm1
	test	%rdx, %rdx
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
#endif

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
	sub	$16, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
	add	%rcx, %rdx
	sub	$64, %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
/* Check which byte is a match.  */
	bsf	%eax, %eax

	sub	%rax, %rdx
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
	   possible addition overflow.  */
	neg	%rcx
	add	$16, %rcx
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)

	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
	add	%rcx, %rdx

	.p2align 4
L(align64_loop):
	sub	$64, %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
	add	$32, %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$16, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	add	$32, %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$16, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)

#ifndef USE_AS_WMEMCHR
strong_alias (memchr, __memchr)
libc_hidden_builtin_def(memchr)
#endif

嘗試編譯時發生以下錯誤

memchr.S:16:10: fatal error: sysdep.h: No such file or directory
   16 | #include <sysdep.h>
      |          ^~~~~~~~~~

先將這一行暫時註解掉,執行以下指令

gcc -c memchr.S

得到以下輸出

memchr.S: Assembler messages: memchr.S:30: Error: invalid character '(' in mnemonic memchr.S:41: Error: junk `(return_null)' after expression memchr.S:49: Error: junk `(crosscache)' after expression memchr.S:56: Error: junk `(matches_1)' after expression memchr.S:58: Error: junk `(return_null)' after expression memchr.S:64: Error: junk `(exit_loop)' after expression memchr.S:65: Error: junk `(loop_prolog)' after expression memchr.S:68: Error: invalid character '(' in mnemonic memchr.S:79: Error: junk `(unaligned_no_match)' after expression memchr.S:84: Error: junk `(return_null)' after expression memchr.S:90: Error: invalid character '(' in mnemonic memchr.S:97: Error: junk `(return_null)' after expression memchr.S:100: Error: junk `(exit_loop)' after expression memchr.S:103: Error: invalid character '(' in mnemonic memchr.S:108: Error: junk `(matches)' after expression memchr.S:114: Error: junk `(matches16)' after expression memchr.S:120: Error: junk `(matches32)' after expression memchr.S:127: Error: junk `(matches0)' after expression memchr.S:130: Error: junk `(align64_loop)' after expression memchr.S:133: Error: junk `(exit_loop)' after expression memchr.S:139: Error: junk `(matches)' after expression memchr.S:145: Error: junk `(matches16)' after expression memchr.S:151: Error: junk `(matches32)' after expression memchr.S:159: Error: junk `(matches0)' after expression memchr.S:167: Error: invalid character '(' in mnemonic memchr.S:169: Error: junk `(exit_loop)' after expression memchr.S:188: Error: junk `(align64_loop)' after expression memchr.S:194: Error: junk `(matches)' after expression memchr.S:198: Error: junk `(matches16)' after expression memchr.S:206: Error: junk `(matches32)' after expression memchr.S:214: Error: invalid character '(' in mnemonic memchr.S:216: Error: junk `(exit_loop_32)' after expression memchr.S:222: Error: junk `(matches)' after expression memchr.S:228: Error: junk `(matches16)' after expression memchr.S:234: Error: junk `(matches32_1)' after expression memchr.S:236: Error: junk `(return_null)' after expression memchr.S:241: Error: junk `(matches48_1)' after expression memchr.S:246: Error: invalid character '(' in mnemonic memchr.S:252: Error: junk `(matches_1)' after expression memchr.S:254: Error: junk `(return_null)' after expression memchr.S:259: Error: junk `(matches16_1)' after expression memchr.S:264: Error: invalid character '(' in mnemonic memchr.S:270: Error: invalid character '(' in mnemonic memchr.S:276: Error: invalid character '(' in mnemonic memchr.S:282: Error: invalid character '(' in mnemonic memchr.S:288: Error: invalid character '(' in mnemonic memchr.S:291: Error: junk `(return_null)' after expression memchr.S:296: Error: invalid character '(' in mnemonic memchr.S:299: Error: junk `(return_null)' after expression memchr.S:304: Error: invalid character '(' in mnemonic memchr.S:307: Error: junk `(return_null)' after expression memchr.S:312: Error: invalid character '(' in mnemonic memchr.S:315: Error: junk `(return_null)' after expression memchr.S:320: Error: invalid character '(' in mnemonic memchr.S:323: Error: invalid character '(' in mnemonic memchr.S:326: Error: no such instruction: `strong_alias (memchr,__memchr)' memchr.S:327: Error: no such instruction: `libc_hidden_builtin_def(memchr)'

上方的 error 大多都是格式上的錯誤,參考 baby-steps-in-assembly 將組合語言稍加修改,引入 macro.h ,並將此 memchr define 為 memchr1 ,方便在主程式 test.c 呼叫。部分擷取如下:

//#include "sysdep.h"
#include "macros.h"

#ifdef USE_AS_WMEMCHR
# define MEMCHR		wmemchr
# define PCMPEQ		pcmpeqd
#else
# define MEMCHR1		memchr1
# define PCMPEQ		pcmpeqb
#endif

/* fast SSE2 version with using pmaxub and 64 byte loop */


ENTRY(MEMCHR1)
	movd	%esi, %xmm1
	mov	%edi, %ecx

在主程式也要宣告相對應之 function

void *memchr1(const void *cs, int c, size_t count);

將所有 label 的格式修改,如: L(return_null) 改成 L_return_null。最後幾行的 #ifndef 註解掉使編譯能通過。

執行以下命令編譯 test.c 並與 memchr.S link 起來

gcc -m64 -g -ggdb -Wextra test.c memchr.S -o a.out

執行測驗一之測試程式的結果如下

$ ./a.out
String after |.| is - |.csie.ncku.edu.tw|

效能比較

我發現先前分析之程式碼有錯誤,memchr_opt 前面使用之 usleep() 太長,導致測量的時間較長,低估了效率。因此我將每一個 memchr 測試統一延遲 1 個

μs ,這樣的結果會比較公平,受到的影響皆相同。

我也參考 fibdrv 的 效能分析提示 將測試程式指定在一個 CPU 上,排除干擾效能分析的因素,在我筆電的雙系統上測試,得到下圖:

由此圖很明顯可看出效能由高到低為 glibc > memchr-opt > memchr-x86 > original

比較出人意外的是 memchr_optmemchr-x86 快,回顧 memchr-x86 程式碼,

void *memchr_x86(const void *cs, int c, size_t count)
{
	int d0;
	void *res;
	if (!count)
		return NULL;
	asm volatile("repne\n\t"
		"scasb\n\t"
		"je 1f\n\t"
		"mov $1,%0\n"
		"1:\tdec %0"
		: "=D" (res), "=&c" (d0)
		: "a" (c), "0" (cs), "1" (count)
		: "memory");
	return res;
}

根據 GCC-Inline-Assembly-HOWTO 以及 Extended Asm - Assembler Instructions with C Expression Operands , Inline Assembly 之架構如下:

    asm ( assembler template 
           : output operands                  /* optional */
           : input operands                   /* optional */
           : list of clobbered registers      /* optional */
           );

output operands 和 input operands 放置 register 對應的變數,會用 %數字 從 output operands 編號,以 memchr_x86 為例, %0 即是 "=D" (res)%1"=&c" (d0) ,若 output 和 input 為相同 register ,則會出現 "0" (cs) 的型態,表示 cs 和編號 0 , 即"=D" (res) 同一個 register 。

在這段組合語言中,變數對應的 register 如下:

Parameter Register
cs EDI
c AL(EAX)
count ECX
d0 ECX
res EDI

根據 Intel Architecture Software Developer’s Manual Volume 2:
Instruction Set Reference
,在第 365 頁,第一個指令 repne (repeat while not equal) 是和 string 操作相關,其執行條件如下:

Repeat Prefix Termination Condition 1 Termination Condition 2
REPNE/REPNZ ECX=0 ZF=1

換而言之, repne 的作用類似 while 迴圈,大略可對應 Linux 核心原本 (與平台無關) 中的 while (n-- != 0)

scasb 是用來比較從記憶體搬來的 1 個 byte 和暫存器 AL 的值是否相等。根據結果設定對應之EFLAGS register 中的 status flag ,應該是 Zero Flag 。比較完後,會依據 EFLAGS register 的 DF Flags 將 EDI register 加一或減一。這裡沒有特別設定,為加一。

當符合 repne 結束條件,即 ZF = 1 或走完整個字串後,je 會判斷 ZF 的值。其功能為 Jump short if equal (ZF=1) ,因此有找到 match 之字元,會跳至 label 1 ,用 decres 減一,因為找到 match 字串後 EDI register 會自動加一。

反之,ZF = 0 ,則會將 1 放入 res ,後面的 decres 減一得 0 ,即 NULL

由以上說明可見 memchr_x86 其實就是 Linux 核心原本 (與平台無關) 換成組合語言,我將這兩個 function 的 assmebly 列出,如下:
gcc -S test.c

memchr_x86
memchr_x86:
.LFB0:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	movq	%rdi, -24(%rbp)
	movl	%esi, -28(%rbp)
	movq	%rdx, -40(%rbp)
	cmpq	$0, -40(%rbp)
	jne	.L2
	movl	$0, %eax
	jmp	.L3
.L2:
	movl	-28(%rbp), %eax
	movq	-24(%rbp), %rdx
	movq	-40(%rbp), %rcx
	movq	%rdx, %rdi
#APP
# 9 "test.c" 1
	repne
	scasb
	je 1f
	mov $1,%rdi
1:	dec %rdi
# 0 "" 2
#NO_APP
	movl	%ecx, %eax
	movq	%rdi, %rdx
	movq	%rdx, -8(%rbp)
	movl	%eax, -12(%rbp)
	movq	-8(%rbp), %rax
.L3:
	popq	%rbp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE0:
	.size	memchr_x86, .-memchr_x86
memchr_linux
memchr_linux:
.LFB1:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	movq	%rdi, -24(%rbp)
	movl	%esi, -28(%rbp)
	movq	%rdx, -40(%rbp)
	movq	-24(%rbp), %rax
	movq	%rax, -8(%rbp)
	jmp	.L5
.L7:
	movl	-28(%rbp), %eax
	movl	%eax, %ecx
	movq	-8(%rbp), %rax
	leaq	1(%rax), %rdx
	movq	%rdx, -8(%rbp)
	movzbl	(%rax), %eax
	cmpb	%al, %cl
	jne	.L5
	movq	-8(%rbp), %rax
	subq	$1, %rax
	jmp	.L6
.L5:
	movq	-40(%rbp), %rax
	leaq	-1(%rax), %rdx
	movq	%rdx, -40(%rbp)
	testq	%rax, %rax
	jne	.L7
	movl	$0, %eax
.L6:
	popq	%rbp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE1:
	.size	memchr_linux, .-memchr_linux
	.section	.rodata

可看出 memchr_x86 在比較字串方面比 memchr_linux 少不少,推測這應該是效能較佳的原因。

memchr_opt 和 glibc memchr.c

下圖為 memchr_optglibc-memchr.c 的執行時間比較,測量方式如 memchr 與 strchr 比較 所述,有趣的是兩者執行的時間幾乎相同。

接著觀察 glibc-memchr.c 的程式碼,如下:

void *MEMCHR (void const *s, int c_in, size_t n) { /* On 32-bit hardware, choosing longword to be a 32-bit unsigned long instead of a 64-bit uintmax_t tends to give better performance. On 64-bit hardware, unsigned long is generally 64 bits already. Change this typedef to experiment with performance. */ typedef unsigned long int longword; const unsigned char *char_ptr; const longword *longword_ptr; longword repeated_one; longword repeated_c; unsigned char c; c = (unsigned char) c_in; /* Handle the first few bytes by reading one byte at a time. Do this until CHAR_PTR is aligned on a longword boundary. */ for (char_ptr = (const unsigned char *) s; n > 0 && (size_t) char_ptr % sizeof (longword) != 0; --n, ++char_ptr) if (*char_ptr == c) return (void *) char_ptr; longword_ptr = (const longword *) char_ptr; /* All these elucidatory comments refer to 4-byte longwords, but the theory applies equally well to any size longwords. */ /* Compute auxiliary longword values: repeated_one is a value which has a 1 in every byte. repeated_c has c in every byte. */ repeated_one = 0x01010101; repeated_c = c | (c << 8); repeated_c |= repeated_c << 16; if (0xffffffffU < (longword) -1) { repeated_one |= repeated_one << 31 << 1; repeated_c |= repeated_c << 31 << 1; if (8 < sizeof (longword)) { size_t i; for (i = 64; i < sizeof (longword) * 8; i *= 2) { repeated_one |= repeated_one << i; repeated_c |= repeated_c << i; } } } /* Instead of the traditional loop which tests each byte, we will test a longword at a time. The tricky part is testing if *any of the four* bytes in the longword in question are equal to c. We first use an xor with repeated_c. This reduces the task to testing whether *any of the four* bytes in longword1 is zero. We compute tmp = ((longword1 - repeated_one) & ~longword1) & (repeated_one << 7). That is, we perform the following operations: 1. Subtract repeated_one. 2. & ~longword1. 3. & a mask consisting of 0x80 in every byte. Consider what happens in each byte: - If a byte of longword1 is zero, step 1 and 2 transform it into 0xff, and step 3 transforms it into 0x80. A carry can also be propagated to more significant bytes. - If a byte of longword1 is nonzero, let its lowest 1 bit be at position k (0 <= k <= 7); so the lowest k bits are 0. After step 1, the byte ends in a single bit of value 0 and k bits of value 1. After step 2, the result is just k bits of value 1: 2^k - 1. After step 3, the result is 0. And no carry is produced. So, if longword1 has only non-zero bytes, tmp is zero. Whereas if longword1 has a zero byte, call j the position of the least significant zero byte. Then the result has a zero at positions 0, ..., j-1 and a 0x80 at position j. We cannot predict the result at the more significant bytes (positions j+1..3), but it does not matter since we already have a non-zero bit at position 8*j+7. So, the test whether any byte in longword1 is zero is equivalent to testing whether tmp is nonzero. */ while (n >= sizeof (longword)) { longword longword1 = *longword_ptr ^ repeated_c; if ((((longword1 - repeated_one) & ~longword1) & (repeated_one << 7)) != 0) break; longword_ptr++; n -= sizeof (longword); } char_ptr = (const unsigned char *) longword_ptr; /* At this point, we know that either n < sizeof (longword), or one of the sizeof (longword) bytes starting at char_ptr is == c. On little-endian machines, we could determine the first such byte without any further memory accesses, just by looking at the tmp result from the last loop iteration. But this does not work on big-endian machines. Choose code that works in both cases. */ for (; n > 0; --n, ++char_ptr) { if (*char_ptr == c) return (void *) char_ptr; } return NULL; }

其程式碼之想法和 memchr_opt 一樣,都是採用 64 bits 之 mask 來尋找目標字元。程式碼一開始同樣是對字串做 alignment,接著用相近的方式做出 64 bits 之 mask ,比較 64 bits 之方法同樣採用 xor 等等。 memchr_opt 使採用 DETECT_CHAR 來包裝這樣的操作。總而言之,這兩者是用不同的方式實作相同的想法,因此執行時間上也幾乎一樣。

memchr 和 strchr 比較

strchr 的 Linux 組合語言版本尚有問題須解決,如下面所示,先列出其他之比較。

Error: unsupported instruction `mov'

效能分析之模式參考 https://gist.github.com/nico/2624336 ,測量方式依舊使用 struct timespec ,測量執行兩百次之時間再除以兩百得到執行一次之時間。



下圖是 glibc-memchr.S 和直接呼叫 <string.h>memchr 的比較,兩者的執行時間皆是所有 memchr 版本中前二低,推測 <string.h>memchr 應該是 glibc-memchr.S 的實作。

接著比較 memchrrstrchr 之差別,

由上圖可見,memchr-linuxstrchr-linux 還要快,到 glibc-memchr.cglibc-strchr.c 時依舊有領先,但比到 default 的部分,如下圖,strchr 就比 memchr 還快。

組合語言的部分也是如此