2022q1 Homework4 (quiz4)

contributed by < freshLiver >

第一題 - ceil_log2

EXP1

































// 最高位元 1 在哪
int ceil_log2(uint32_t x)
{
    uint32_t r, shift;

    x--;  // 先減 1 最後就只要無條件進位

    // 檢查最高位 1 在第 16 個位元之後
    r = (x > 0xFFFF) << 4;  // 找到的話 r == 16
    x >>= r;                // 對半切, 結果 + 16 or 00

    // 檢查最高位 1 在第 8 個位元之後
    shift = (x > 0xFF) << 3;  // 找到的話 shift == 8
    x >>= shift;              // 對半切
    r |= shift;               // 結果 + 8 or 0
    
    // 檢查最高位 1 在第 4 個位元之後
    shift = (x > 0xF) << 2;  // 找到的話 shift = 4
    x >>= shift;             // 對半切
    r |= shift;              // 結果 + 4 or 0

    // 檢查最高位 1 在第 2 個位元之後
    shift = (x > 0x3) << 1;  // 找到的話 shift = 2
    x >>= shift;             // 對半切
    r |= shift;              // 結果 + 2 or 0
    
    // 檢查最高位 1 在第 1 個位元之後
    shift = (x > 0x1) << 0;  // 找到的話 shift = 1
    x >>= shift;             // 對半切 (多餘)
    r |= shift;              // 結果 + 1 or 0
    
    return r + 1;  //  x-1 的最高位 1 的位置 +1 (無條件進位)
}

每次分成高低位元兩部份，用二元搜尋檢查最高位元的 1 是在哪個部份，總共重複

l o g_{2} (32) = 5

次，即上方程式碼的 8 至 30 行的部份。

而若是單純重複五次的話，有些操作會是多餘的，例如第 29 行的位移就是多餘的，第五次檢查的結果（第 28 行）也可以直接與第四次的結果透過 | 相加，因此最後可以將：








    shift = (x > 0x3) << 1;  // 找到的話 shift = 2
    x >>= shift;             // 對半切
    r |= shift;              // 可與後面的運算合併
    
    // 檢查最高位 1 在第 1 個位元之後
    shift = (x > 0x1) << 0;  // ( << 0 多餘，可與 25 行合併)
    x >>= shift;             // (多餘)
    r |= shift;              // (可與 25 行合併)

簡化成：





    shift = (x > 0x3) << 1;  // 找到的話 shift = 2
    x >>= shift;             // 對半切
    r = r | shift | x > 1;
        
    return (EXP1) + 1;

因此 EXP1 應為 r | shift | x > 1。

TODO : x > 1 改為 x >> 1 的表現

比較需要注意的是，若是

2^{n - 1} < x < 2^{n}

的話，結果應為

⌈ l o g_{2} (x) ⌉ = n

；但若是

x

恰好為

2^{n - 1}

的話，結果則應為

⌈ l o g_{2} (x) ⌉ = n - 1

，因此在一開始就先進行 x-- 讓

x = 2^{n - 1}

的特例消失，最後再透過 +1 補回即可。

維持 Branchless 並解決 x 為 0 時的特例

若 x 為 0 時 fls 應為 1


    x -= !!x;

在 x == 0 時，只需要一個位元即可儲存，在這種情況下 x 不需要進行 --，因此最後的 (r | shift | x > 1) 部份會是 0，但與其他情況相同，會進行無條件進位，因此恰好可以處理 x == 0 的狀況。

若 x 為 0 時 fls 應為 0

若 x 為 0 的結果需要是 0 的話，則連最後的無條件進位都不需要，因此需要額外使用變數儲存原始的 x （或是不直接對 x 操作，而是對另一變數進行位移檢查）：

int ceil_log2(uint32_t x)
{
    bool notZero = x;
    uint32_t r, shift;

    x -= notZero;  // 先減 1 最後就只要無條件進位

    ...

    // 結果加 shift 並檢查最高位 1 是否在第 1 位元後
    //  x-1 的最高位 1 的位置 +1 (無條件進位)
    return (r | shift | x > 1) + notZero;
}

若 x 為 0 時 fls 應為 `-HUGE_VAL`

在 log(x) 的 Man Page 中有提到當 x 為 0 時的處理方式：

If x is zero, then a pole error occurs, and the functions return -HUGE_VAL, -HUGE_VALF, or -HUGE_VALL, respectively.

而關於這 HUGE_VAL 的說明可以從 n1570 的 Annex F.10 Mathematics <math.h> 找到：

The Standard C macro HUGE_VAL and its float and long double analogs, HUGE_VALF and HUGE_VALL, expand to expressions whose values are positive infinities.

可以看到這三個巨集分別用來表示 double、float、long double 三種長度的浮點數型別的「無窮大」，因此 -HUGE_VAL 就代表了三種浮點數型別的「負無窮大」。而由於這題的參數是 32 位元的整數，所以這邊選擇以 logf 作為參考，改以 float 作為回傳型別，並在 x 為 0 時回傳 HUGE_VALF。

而為了以 Branchless 的方式在 x 為 0 時回傳 HUGE_VALF，這個實作以若 x 為 0 時 fls 應為 0 的實作為基礎進行調整：

float ceil_log2(uint32_t x)
{
    uint32_t r, shift;

    float result = -HUGE_VALF;
    uint32_t resultMask = (-!x) & (*(uint32_t *) &result);  // 0 if x != 0

    x -= !resultMask;  // 先減 1 最後就只要無條件進位

    ...


    result = (r | shift | x > 1) + !resultMask;
    resultMask |= *(uint32_t *) &result;

    // check huge val
    return *(float *) &resultMask;
}

首先將 result 初始化為 -HUGE_VALF，並利用另一個 32 位元的整數 resultMask 儲存 -!x 與 result 對應的 32 位元記憶體內容進行 AND 運算的結果：

當 x == 0 時： resultMask == -HUGE_VALF
當 x != 0 時： resultMask == 0

接著計算 ilog2 的結果（x 為 0 時 result 也為 0）並以 float 型別儲存，此時可以發現：

當 x == 0 時：result == 0f、resultMask == -HUGE_VALF
當 x != 0 時：result == ceil_ilog2(x)、resultMask == 0

因此最後只要將 result 的記憶體內容與 resultMask 進行 OR 運算，並以 float 形式回傳即可以 Branchless 的方式在 x 為零時回傳 -HUGE_VALF。

找出並說明 Linux 核心中的實作

使用

$ grep -rP --include="*.[ch]" "define [^ ]*ilog[^ ]*\\("

可以找到 ilog 相關的巨集定義，然後看到找到

#define ROUNDUP_LOG2(x) ilog2(roundup_pow_of_two(x))

出現在 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h 中。而它使用到的 ilog 以及 roundup_pow_of_two 都是定義在 include/linux/log2.h 中。

ilog2 巨集

首先先看 ilog 的部份：

/**
 * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value
 * @n: parameter
 *
 * constant-capable log of base 2 calculation
 * - this can be used to initialise global variables from constant data, hence
 * the massive ternary operator construction
 *
 * selects the appropriately-sized optimised version depending on sizeof(n)
 */
#define ilog2(n) \
( \
	__builtin_constant_p(n) ?	\
	((n) < 2 ? 0 :			\
	 63 - __builtin_clzll(n)) :	\
	(sizeof(n) <= 4) ?		\
	__ilog2_u32(n) :		\
	__ilog2_u64(n)			\
 )

可以看到它使用了 __builtin_constant_p(n) 來判斷 n 在編譯期間是否為常數，若是的話就會使用 (n) < 2 ? 0 : 63 - __builtin_clzll(n)，推測是編譯器在編譯時期就可以直接計算出結果。

而為了驗證這個推測，可以利用前面提到的 GCC 的 __builtin_constant_p 來做簡單的測試：

#define var_log2_is_const(var)                                           \
    ({                                                                   \
        bool _res = __builtin_constant_p(ilog2(var)) ? true : false;     \
        printf("%s(%d) \tis%sconst\n", #var, var, _res ? " " : " not "); \
    })


int main(int argc, char const *argv[])
{
    var_log2_is_const(156);

    int var1 = 343;
    var_log2_is_const(var1);

    const int var2 = 465;
    var_log2_is_const(var2);

    static int var3 = 546;
    var_log2_is_const(var3);

    int var4 = atoi(argv[1]);
    var_log2_is_const(var4);

    return 0;
}

這個測試程式可以用來檢查 ilog2 的結果是否能夠在編譯時期得知，其中 ilog2 的實作為：

#define ilog2(n) \
    (__builtin_constant_p(n) ? ((n) < 2 ? 0 : 63 - __builtin_clzll(n)) : fls(n))

static __always_inline int fls(unsigned int x)
{
    int r = 32;

    if (!x)
        return 0;
    if (!(x & 0xffff0000u)) {
        x <<= 16;
        r -= 16;
    }
    if (!(x & 0xff000000u)) {
        x <<= 8;
        r -= 8;
    }
    if (!(x & 0xf0000000u)) {
        x <<= 4;
        r -= 4;
    }
    if (!(x & 0xc0000000u)) {
        x <<= 2;
        r -= 2;
    }
    if (!(x & 0x80000000u)) {
        x <<= 1;
        r -= 1;
    }
    return r;
}

為了方便進行測試，將 n 不為常數時的實作改用其他方式實作，但不影響這次測試的結果。

而測試結果則為：

$ make test OPT_LV=0
gcc -O0 const_log2.c -o const_log2.out
./const_log2.out 111
156(156)        is const
var1(343)       is not const
var2(465)       is not const
var3(546)       is not const
var4(111)       is not const

$ make test OPT_LV=1
gcc -O1 const_log2.c -o const_log2.out
./const_log2.out 111
156(156)        is const
var1(343)       is not const
var2(465)       is const
var3(546)       is not const
var4(111)       is not const

可以看到當 ilog2(n) 的 n 以常數或是 static 變數帶入時，可以看到 ilog(n) 的結果確實可以在編譯時期就知道結果，與推測相符。

但比較意外的是以 const 修飾的變數 var1 反而沒辦法在編譯時期就得知結果。

TODO

若是非常數的話則會根據 32 或是 64 位元使用對應的 ilog2 函式處理：

/*
 * non-constant log of base 2 calculators
 * - the arch may override these in asm/bitops.h if they can be implemented
 *   more efficiently than using fls() and fls64()
 * - the arch is not required to handle n==0 if implementing the fallback
 */
#ifndef CONFIG_ARCH_HAS_ILOG2_U32
static __always_inline __attribute__((const))
int __ilog2_u32(u32 n)
{
	return fls(n) - 1;
}
#endif

#ifndef CONFIG_ARCH_HAS_ILOG2_U64
static __always_inline __attribute__((const))
int __ilog2_u64(u64 n)
{
	return fls64(n) - 1;
}
#endif

可以看到這邊直接使用了 fls 來找最高位的 1，且可以從 #ifndef 的使用以及註解可以看到，若是硬體架構能夠提供比 fls 還要快速的計算方式的話，就會使用對應的 ilog 方式處理，而有提供的架構可以用以下方式尋找：

$ grep -ir --include="*" "HAS_ILOG2"
include/linux/log2.h:#ifndef CONFIG_ARCH_HAS_ILOG2_U32
include/linux/log2.h:#ifndef CONFIG_ARCH_HAS_ILOG2_U64
arch/parisc/Kconfig:config ARCH_HAS_ILOG2_U32
arch/parisc/Kconfig:config ARCH_HAS_ILOG2_U64
arch/m68k/Kconfig:config ARCH_HAS_ILOG2_U32
arch/m68k/Kconfig:config ARCH_HAS_ILOG2_U64
arch/s390/Kconfig:config ARCH_HAS_ILOG2_U32
arch/s390/Kconfig:config ARCH_HAS_ILOG2_U64
arch/alpha/Kconfig:config ARCH_HAS_ILOG2_U32
arch/alpha/Kconfig:config ARCH_HAS_ILOG2_U64
arch/xtensa/Kconfig:config ARCH_HAS_ILOG2_U32
arch/xtensa/Kconfig:config ARCH_HAS_ILOG2_U64
arch/sh/Kconfig:config ARCH_HAS_ILOG2_U32
arch/sh/Kconfig:config ARCH_HAS_ILOG2_U64
arch/arm/Kconfig:config ARCH_HAS_ILOG2_U32
arch/arm/Kconfig:config ARCH_HAS_ILOG2_U64
arch/microblaze/Kconfig:config ARCH_HAS_ILOG2_U32
arch/microblaze/Kconfig:config ARCH_HAS_ILOG2_U64

可以發現 x86 並未出現在上面的結果中，因此會使用上方的實作用 fls 進行計算，而 fls 則在 x86 架構中有提供更快的處理方式，可以在 arch/x86/include/asm/bitops.h 看到它的實作是使用 BSLR 指令處理。

嘗試到上方列出的 arm 架構的目錄下透過 $ grep -ir --include="*.[ch]" "ilog2" 尋找 ilog2 的實作，但沒有找到 ilog2 的相關實作。

但在 arch 目錄下尋找的話，則會發現反而是上方沒有列出的 powerpc 架構下有提供相關實作，定義在 arch/powerpc/boot/ops.h 中。

fls 函式

/**
 * fls - find last set bit in word
 * @x: the word to search
 *
 * This is defined in a similar way as the libc and compiler builtin
 * ffs, but returns the position of the most significant set bit.
 *
 * fls(value) returns 0 if value is 0 or the position of the last
 * set bit if value is nonzero. The last (most significant) bit is
 * at position 32.
 */
static __always_inline int fls(unsigned int x)
{
	int r;

#ifdef CONFIG_X86_64
	/*
	 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before, except that the
	 * top 32 bits will be cleared.
	 *
	 * We cannot do this on 32 bits because at the very least some
	 * 486 CPUs did not behave this way.
	 */
	asm("bsrl %1,%0"
	    : "=r" (r)
	    : "rm" (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
	asm("bsrl %1,%0\n\t"
	    "cmovzl %2,%0"
	    : "=&r" (r) : "rm" (x), "rm" (-1));
#else
	asm("bsrl %1,%0\n\t"
	    "jnz 1f\n\t"
	    "movl $-1,%0\n"
	    "1:" : "=r" (r) : "rm" (x));
#endif
	return r + 1;
}

BSF (Bit Scan Forward) 與 ffs 的作用相同，是從最低位元開始沼地一個 1 的位置，而 BSR (Bit Scan Reverse) 指令則是從最高位元開始找，即與 fls 的作用相同。

比較特別的部份是當 x 為 0 時 fls，的輸出應為 0，然而根據 Commit Message 的說明，Intel64 與 AMD64 的 BSR 指令在 x 為 0 時會有不同的表現：

Intel64：

在 Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 2A: Instruction Set Reference, A-L 的 3.2 INSTRUCTIONS (A-L) 中 BSR 說明了 x 為 0 時目標暫存器的值是未定義：

If the content source operand is 0, the content of the destination operand is undefined.

The ZF flag is set to 1 if the source operand is 0;
AMD64：

然而在 AMD 對 BSR 的指令說明文件中則寫到當 x 為 0 時目標暫存器的值不會被修改。

If the second operand contains 0, the instruction sets ZF to 1 and does not change the contents of the destination register.

而在這則 Commit 前的實作是：

static inline int fls(int x)
{
	int r;
#ifdef CONFIG_X86_CMOV
	asm("bsrl %1,%0\n\t"
	    "cmovzl %2,%0"
	    : "=&r" (r) : "rm" (x), "rm" (-1));
#else
	asm("bsrl %1,%0\n\t"
	    "jnz 1f\n\t"
	    "movl $-1,%0\n"
	    "1:" : "=r" (r) : "rm" (x));
#endif
	return r + 1;
}

這個實作只區別了是否有 CMOVZ 指令能夠快速在 ZF Flag 為 1 時（即 x 為 0 時）將輸出的變數 r 設為 -1；若沒有 CMOVZ 的話則需要使用更多指令才能將 r 設為 -1。

而在該 Commit 訊息中有提到：

The Intel x86_64 specification, however, says that the result of BSR/BSF in such a case is undefined. That said, when queried, one of the Intel CPU architects said that the behaviour on all Intel CPUs is that:

with BSRQ/BSFQ, the 64-bit destination register is written with its original value if the source is 0, thus, in essence, giving the effect we want. And,

with BSRL/BSFL, the lower half of the 64-bit destination register is written with its original value if the source is 0, and the upper half is cleared, thus giving us the effect we want (we return a 4-byte int).

Further, it was indicated that they (Intel) are unlikely to get away with changing the behaviour.

CS:APP 3.4.2 WLQ suffix

從這個說明可以看到在 x86_64 架構下，不論是對 64 位元資料或是 32 位元資料操作，其實都不會修改到輸出的 64 或 32 位元暫存器的內容，因此若是在一開始就將目標暫存器設為 -1 的話，就可以避免 CMOVZ 這個 branch。

從該 Patch 的討論來看，這則 Commit 訊息中的 one of the Intel CPU architects 看起來是有人直接詢問 Intel 的架構師。

因此在這個 Patch 中進行了以下修改：

- #ifdef CONFIG_X86_CMOV
+ 
+ #ifdef CONFIG_X86_64
+ 	/*
+ 	 * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
+ 	 * dest reg is undefined if x==0, but their CPU architect says its
+ 	 * value is written to set it to the same as before, except that the
+ 	 * top 32 bits will be cleared.
+ 	 *
+ 	 * We cannot do this on 32 bits because at the very least some
+ 	 * 486 CPUs did not behave this way.
+ 	 */
+ 	long tmp = -1;
+ 	asm("bsrl %1,%0"
+ 	    : "=r" (r)
+ 	    : "rm" (x), "0" (tmp));
+ #elif defined(CONFIG_X86_CMOV)

可以看到當架構為 x86_64 時，透過 GNU C Extension 中 Extended Asm 中 InputOperands 的 constraint 特性：

Input constraints can also be digits (for example, "0"). This indicates that the specified input must be in the same place as the output constraint at the (zero-based) index in the output constraint list.

這會讓變數 tmp 直接使用第 0 個 Output Operand ，也就是變數 r，對應的暫存器，即相當於將變數 r 的初始值設為 -1，因此不需要使用 CMOVZ 進行額外確認。

這個 Commit 是另外用了一個變數儲存 -1 作為 r 的初始值，但在之後的 Commit 指出了指令的結果以及回傳值都是 32 位元，因此可以不用另外用一個 long 型別的變數 tmp 儲存 -1，直接寫成

    asm("bsrl %1,%0"
 	    : "=r" (r)
 	    : "rm" (x), "0" (-1));

就可以了。

在 64 位元的 fls 實作中：

#ifdef CONFIG_X86_64
static __always_inline int fls64(__u64 x)
{
	int bitpos = -1;
	/*
	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
	 * dest reg is undefined if x==0, but their CPU architect says its
	 * value is written to set it to the same as before.
	 */
	asm("bsrq %1,%q0"
	    : "+r" (bitpos)
	    : "rm" (x));
	return bitpos + 1;
}
#else
#include <asm-generic/bitops/fls64.h>
#endif

可以看到它是直接將儲存結果的變數 bitops 初始值設為 -1，而不像 fls 是使用 Input Constraint 設定初始值，不太明白為什麼 fls 不使用相同的方式就好。

而在該 Commit 訊息中也有簡單的測試了使用這個特性後的效能改進，可以看到修改後（第三個結果）的執行時間只有原本（第二個結果）的一半左右：

$ time ./get_order
    real    1m37.191s
    user    1m36.313s
    sys     0m0.861s
$ time ./get_order x
    real    0m16.892s
    user    0m16.586s
    sys     0m0.287s
$ time ./get_order x x
    real    0m7.731s
    user    0m7.727s
    sys     0m0.002s

而若硬體架構沒有支援 fls 的話，則會使用 include/asm-generic/bitops/ 中 fls.h 以及 fls64.h 各自的實作，其中 32 位元版本的 fls 實作如下：

/**
 * fls - find last (most-significant) bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
 */

static __always_inline int fls(unsigned int x)
{
	int r = 32;

	if (!x)
		return 0;
	if (!(x & 0xffff0000u)) {
		x <<= 16;
		r -= 16;
	}
	if (!(x & 0xff000000u)) {
		x <<= 8;
		r -= 8;
	}
	if (!(x & 0xf0000000u)) {
		x <<= 4;
		r -= 4;
	}
	if (!(x & 0xc0000000u)) {
		x <<= 2;
		r -= 2;
	}
	if (!(x & 0x80000000u)) {
		x <<= 1;
		r -= 1;
	}
	return r;
}

概念上與這題使用的技巧相似，但是位移方向相反，當最高位元的 1 不在高位元部份時，就左移掉高位元部份。而 64 位元版本則根據 BITS_PER_LONG 分成兩種實作方式：

#if BITS_PER_LONG == 32
static __always_inline int fls64(__u64 x)
{
	__u32 h = x >> 32;
	if (h)
		return fls(h) + 32;
	return fls(x);
}
#elif BITS_PER_LONG == 64
static __always_inline int fls64(__u64 x)
{
	if (x == 0)
		return 0;
	return __fls(x) + 1;
}
#else
#error BITS_PER_LONG not 32 or 64
#endif

當 BITS_PER_LONG 為 32 位元時，就先檢查最高位的 1 在哪側決定是否要 + 32，接著再以 32 位元版本的 fls 找出實際位置。而當 BITS_PER_LONG 為 64 位元時，則是只檢查 x 為 0 的特例，其他狀況則是用 __fls() 計算最高位 1 的位置，並將結果加一使 fls64() 回傳的 Index 是從 1 開始數。

fls64.h 中只有 #include <asm/types.h> 這個不包含 fls 以及 __fls 的標頭檔，且根據 Commit 訊息，它是因為 __u32 與 __u64 未被宣告才被加入的，最初的實作甚至連 include 都沒有，看不出來這邊的 fls 以及 __fls 是使用哪裡的實作。

Round Up

而 roundup_pow_of_two 也與 ilog2 相似，會先檢查參數在編譯期間是否為常數，並採取不同的策略：

/**
 * roundup_pow_of_two - round the given value up to nearest power of two
 * @n: parameter
 *
 * round the given value up to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 */
#define roundup_pow_of_two(n)			\
(						\
	__builtin_constant_p(n) ? (		\
		((n) == 1) ? 1 :		\
		(1UL << (ilog2((n) - 1) + 1))	\
				   ) :		\
	__roundup_pow_of_two(n)			\
 )

當 n 為常數時，會使用前面說明的 ilog2 巨集，並用了與這題相同的技巧（先找 n - 1 的 ilog 再將結果加一），最後再進行左移即可得到

2^{⌈ i l o g (n) ⌉}

，即首個大於等於 n 的 2 的冪。

而在 n 不為常數時則會呼叫 __roundup_pow_of_two 這個函式：









/**
 * __roundup_pow_of_two() - round up to nearest power of two
 * @n: value to round up
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
	return 1UL << fls_long(n - 1);
}

其中的 fls_long 則是定義在 include/linux/bitops.h 中，會根據 long 型別的長度判斷要用 32 函式 64 位元的 fls 實作：

static inline unsigned fls_long(unsigned long l)
{
	if (sizeof(l) == 4)
		return fls(l);
	return fls64(l);
}

型別 size 是固定的，應該也可用 #ifdef 判斷 BITS_PER_LONG 來決定要用哪個時長度的。

Linux 核心中的應用情景

drivers/net/ethernet/mellanox/mlx4/mlx4_en.h

第二題

EXP2, EXP3
































/**
 * 與 fls 相反
 * 每次分成兩半並檢查最低位 1 是否在低位元半部
 * 當 ffs 為 0 時則回傳 0
 * 否則回傳從 1 開始計算的位置
 */
static inline size_t ffs(unsigned long x)
{
    // special case
    if (x == 0)
        return 0;

    size_t shift = BITS_PER_LONG;
    shift >>= 1;  // 第一次右移的量為 LONG 長度的一半

    unsigned long mask = ~0UL;
    mask >>= shift;  // bit mask，檢查最低位 1 用

    size_t i = 1;
    while (shift) {
        // EXP2，檢查 1 是否不在低位元半部
        if ((x & mask) == 0) {
            x >>= shift;  // 位移掉低位元半部
            i += shift;   // EXP3，將移除的長度加到結果上
        }

        shift >>= 1;     // 縮減下次要檢查的長度
        mask >>= shift;  // 縮小下次 mask 範圍
    }

    return i;
}

Branchless ffs




































static inline size_t ffs_nb(unsigned long x)
{
    size_t maskNotZero = -!!x;

    // check lower 32 bits
    size_t shift = !(x & 0xffffffffULL) << 5;
    size_t pos = shift;
    x >>= shift;

    // check lower 16 bits
    shift = !(x & 0x0000ffff) << 4;
    pos |= shift;
    x >>= shift;

    // check lower 8 bits
    shift = !(x & 0x000000ff) << 3;
    pos |= shift;
    x >>= shift;

    // check lower 4 bits
    shift = !(x & 0x0000000f) << 2;
    pos |= shift;
    x >>= shift;

    // check lower 2 bits
    shift = !(x & 0x00000003) << 1;
    pos |= shift;
    x >>= shift;

    // check lower 1 bit
    shift = !(x & 0x00000001) << 0;
    pos |= shift;
    x >>= shift;

    return (pos + 1) & maskNotZero;
}

簡化

第三題 -

第四題 -

第五題 - READ_ONCE

為什麼需要 READ_ONCE

如同 LWN - ACCESS_ONCE() 這篇文章中的說明，在某些情況下，編譯器可能會為了最佳化調整某些程式碼的執行順序，但在這在某些情況下會出現問題，例如這題的題目敘述的情形，因此需要有些機制來保證某些值不會被改變。

ACCESS_ONCE 巨集的誕生

而根據 6.7.3 Type qualifiers 中註腳 134 的說明：

A volatile declaration may be used to describe an object corresponding to a memory-mapped input/output port or an object accessed by an asynchronously interrupting function. Actions on objects so declared shall not be ‘‘optimized out’’ by an implementation or reordered except as permitted by the rules for evaluating expressions.

可以看到有被 volatile 修飾過的變數不應在編譯器最佳化時被重排或是移除。

但若是一變數在宣告時就指定了 volatile 的話，可能會造成即使是沒有問題的部份也不能被編譯器最佳化，因此在 Linux 核心中使用了 ACCESS_ONCE 這個巨集來「暫時」將變數轉成 volatile，以讓特定部份的程式碼可以避免被重排，其實作如下：

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

很單純的，就是將一變數的地址取出，並轉形成對應型別的 volatile 版本後再取值使用。

ACCESS_ONCE 巨集的問題

然而根據 LWN - ACCESS_ONCE() and compiler bugs 以及這則 Commit 訊息中的說明，這種寫法的 volatile 會在 x 不為 scalar type 時被編譯器忽略，造成實際執行情況可能會不符合預期。

因此在該 Commit 以及其後的幾個相關 Commit 中對 ACCESS_ONCE 巨集做了些修正：

新增 `READ_ONCE` 以及 `ASSIGN_ONCE` 的實作 [Commit]

/*
 * Prevent the compiler from merging or refetching reads or writes. The
 * compiler is also forbidden from reordering successive instances of
 * READ_ONCE, ASSIGN_ONCE and ACCESS_ONCE (see below), but only when the
 * compiler is aware of some particular ordering.  One way to make the
 * compiler aware of ordering is to put the two invocations of READ_ONCE,
 * ASSIGN_ONCE or ACCESS_ONCE() in different C statements.
 *
 * In contrast to ACCESS_ONCE these two macros will also work on aggregate
 * data types like structs or unions. If the size of the accessed data
 * type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
 * READ_ONCE() and ASSIGN_ONCE()  will fall back to memcpy and print a
 * compile-time warning.
 *
 * Their two major use cases are: (1) Mediating communication between
 * process-level code and irq/NMI handlers, all running on the same CPU,
 * and (2) Ensuring that the compiler does not  fold, spindle, or otherwise
 * mutilate accesses that either do not require ordering or that interact
 * with an explicit memory barrier or atomic instruction that provides the
 * required ordering.
 */

#define READ_ONCE(x)                                 \
    ({                                               \
        typeof(x) __val;                             \
        __read_once_size(&x, &__val, sizeof(__val)); \
        __val;                                       \
    })

#define ASSIGN_ONCE(val, x)                            \
    ({                                                 \
        typeof(x) __val;                               \
        __val = val;                                   \
        __assign_once_size(&x, &__val, sizeof(__val)); \
        __val;                                         \
    })

為方便解讀，上方的巨集實作為原實作格式化後的結果。

`__read_once_size`

static __always_inline void __read_once_size(volatile void *p, void *res, int size)
{
	switch (size) {
	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;
	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;
#ifdef CONFIG_64BIT
	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;
#endif
	default:
		barrier();
		__builtin_memcpy((void *)res, (const void *)p, size);
		data_access_exceeds_word_size();
		barrier();
	}
}

`__assign_once_size` 分別是：

static __always_inline void __assign_once_size(volatile void *p, void *res, int size)
{
	switch (size) {
	case 1: *(volatile __u8 *)p = *(__u8 *)res; break;
	case 2: *(volatile __u16 *)p = *(__u16 *)res; break;
	case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
#ifdef CONFIG_64BIT
	case 8: *(volatile __u64 *)p = *(__u64 *)res; break;
#endif
	default:
		barrier();
		__builtin_memcpy((void *)p, (const void *)res, size);
		data_access_exceeds_word_size();
		barrier();
	}
}

`data_access_exceeds_word_size`

static __always_inline void data_access_exceeds_word_size(void)
#ifdef __compiletime_warning
__compiletime_warning("data access exceeds word size and won't be atomic")
#endif
;

static __always_inline void data_access_exceeds_word_size(void)
{
}

讓 `ACCESS_ONCE` 限定 Scalar Type 使用 [Commit]

- #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+ #define __ACCESS_ONCE(x) ({ \
+      __maybe_unused typeof(x) __var = 0; \
+     (volatile typeof(x) *)&(x); })
+ #define ACCESS_ONCE(x) (*__ACCESS_ONCE(x))

透過另外建立一個與參數 x 相同的變數 __var 並將其賦值為 0 來確保 x 的型別為 Scalar Type。

__maybe_unused 是被定義在 include/linux/compiler_attributes.h 的一個巨集，實際的定義為 __attribute__((__unused__))。

而 __attribute__(()) 是 GNU C Extension 之一，根據文件的說明：

This attribute, attached to a variable, means that the variable is meant to be possibly unused. GCC will not produce a warning for this variable.

可以知道它只是為了避免編譯器產生「無用變數」的警示訊息。

而在這個實作中使用這個 attribute 的原因是 __var 只是用來檢查型別，實際上並未用到，即編譯器會警告的「無用變數」。