2024q1 第 7 週測驗題

目的: 檢驗學員對 bitwise 的認知

作答表單: 測驗 1 (針對 Linux 核心「設計」課程)
作答表單: 測驗 2 (針對 Linux 核心「實作」課程)

測驗 `1`

SIMD within a register (SWAR) 是軟體最佳化技巧之一，以下展示 SWAR 運用於 64 位元微處理器架構，原本判斷 2 個 32 位元寬度的整數是否都是奇數 (odd)，可能會這樣撰寫:

#include <stdint.h>
bool both_odd(uint32_t x, uint32_t y)
{
    return (x & 1) && (y & 1);
}

但我們可先組合 (compound) 2 個 32 位元寬度的整數為 1 個 64 位元整數，再運用特製的 bitmask，從而減少運算量:

static uint64_t SWAR_ODD_MASK = (1L << 32) + 1;
bool both_odd_swar(uint64_t xy) {
    return (xy & SWAR_ODD_MASK) == SWAR_ODD_MASK;
}

測試程式:

static inline uint64_t bit_compound(uint32_t x, uint32_t y)
{
    return ((0L + x) << 32) | ((y + 0L) & (-1L >> 32));
}

int main()
{
    int x = 12345678;
    int y = 9012345;
    uint64_t xy = bit_compound(x, y);
    printf("%d, %d\n", both_odd(x, y), both_odd_swar(xy));
}

延伸閱讀: SIMD and SWAR Techniques

在 Linux 核心原始程式碼中，lib/string.c 具備 memchr 的實作:

/**
 * memchr - Find a character in an area of memory.
 * @s: The memory area
 * @c: The byte to search for
 * @n: The size of the area.
 *
 * returns the address of the first occurrence of @c, or %NULL
 * if @c is not found
 */
void *memchr(const void *s, int c, size_t n)
{
    const unsigned char *p = s;
    while (n-- != 0) {
        if ((unsigned char)c == *p++) {
            return (void *)(p - 1);
        }
    }
    return NULL;
}

測試程式:

int main()
{   
    const char str[] = "https://wiki.csie.ncku.edu.tw";
    const char ch = '.';
    
    char *ret = memchr_opt(str, ch, strlen(str));
    printf("String after |%c| is - |%s|\n", ch, ret);
    return 0;
}

預期執行結果:

String after |.| is - |.csie.ncku.edu.tw|

利用上述 SIMD within a register (SWAR) 的技巧，我們可改寫為以下 memchr_opt 函式:

#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Nonzero if X is not aligned on a "long" boundary */
#define UNALIGNED(X) ((long) X & (sizeof(long) - 1))

/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))

/* Threshhold for punting to the bytewise iterator */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)

#if LONG_MAX == 2147483647L
#define DETECT_NULL(X) (((X) - (0x01010101)) & ~(X) & (0x80808080))
#else
#if LONG_MAX == 9223372036854775807L
/* Nonzero if X (a long int) contains a NULL byte. */
#define DETECT_NULL(X) \
    (((X) - (0x0101010101010101)) & ~(X) & (0x8080808080808080))
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif

/* @return nonzero if (long)X contains the byte used to fill MASK. */
#define DETECT_CHAR(X, mask) DETECT_NULL(AAAA)

void *memchr_opt(const void *str, int c, size_t len)
{
    const unsigned char *src = (const unsigned char *) str;
    unsigned char d = c;

    while (UNALIGNED(src)) {
        if (!len--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }

    if (!TOO_SMALL(len)) {
        /* If we get this far, len is large and src is word-aligned. */

        /* The fast code reads the source one word at a time and only performs
         * a bytewise search on word-sized segments if they contain the search
         * character. This is detected by XORing the word-sized segment with a
         * word-sized block of the search character, and then checking for the
         * presence of NULL in the result.
         */
        unsigned long *asrc = (unsigned long *) src;
        unsigned long mask = d << 8 | d;
        mask |= mask << 16;
        for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
            mask |= BBBB;

        while (len >= LBLOCKSIZE) {
            if (DETECT_CHAR(CCCC, mask))
                break;
            asrc++;
            len -= LBLOCKSIZE;
        }

        /* If there are fewer than LBLOCKSIZE characters left, then we resort to
         * the bytewise loop.
         */
        src = (unsigned char *) asrc;
    }

    while (len--) {
        if (*src == d)
            return (void *) src;
        src++;
    }

    return NULL;
}

請補完程式碼，使上述 memchr_opt 的實作符合 memchr 行為，作答規範:

以最精簡的程式碼撰寫
以第一次作業規範的程式碼風格書寫
AAAA, BBBB, CCCC 皆為表示式

延伸問題:

解釋上述程式碼運作原理
比較 Linux 核心原本 (與平台無關) 的實作和 memchr_opt，設計實驗來觀察隨著字串長度和特定 pattern 變化的效能影響
研讀 2022 年學生報告，在 Linux 核心原始程式碼找出 x86_64 或 arm64 對應的最佳化實作，跟上述程式碼比較，並嘗試舉出其中的策略和分析

下一步：貢獻程式碼到 Linux 核心
Phoronix 報導

測驗 `2`

若干處理器架構無法有效地處理除法，且我們想避免除法造成的例外狀況 (如 Division by zero)，於是我們考慮以下實作:

#include <stdbool.h>
#include <stdint.h>

#define likely(x) __builtin_expect(!!(x), 1)

static inline bool is_power_of_2(unsigned long n)
{
    return (n != 0 && ((n & (n - 1)) == 0));
}

static inline int __ilog2_u32(uint32_t x)
{
    return x ? sizeof(x) * 8 - __builtin_clz(x) - 1 : 0;
}

static inline int __ilog2_u64(uint64_t x)
{
    uint32_t h = x >> 32;
    if (h)
        return sizeof(h) * 8 - __builtin_clz(h) + 31;
    return x ? sizeof(x) * 8 - __builtin_clz(x) - 1 : 0;
}

/* Calculates the logarithm base 2 of a 32-bit or 64-bit unsigned integer.
 * This function performs a constant-capable log base 2 computation, allowing
 * it to initialize global variables with constant data, which explains the
 * extensive use of ternary operators.
 * It automatically chooses the optimized version suitable for the size of 'n',
 * based on whether it is 32-bit or 64-bit.
 */
#define ilog2(n)                                                       \
    (__builtin_constant_p(n) ? ((n) < 2 ? 0 : 63 - __builtin_clzll(n)) \
     : (sizeof(n) <= 4)      ? __ilog2_u32(n)                          \
                             : __ilog2_u64(n))

/* In cases where the divisor is a constant, we compute its inverse during
 * compile time. This allows us to replace the division operation with several
 * inline multiplications, significantly improving performance.
 */
#define __div64_const32(n, ___b)                                            \
    ({                                                                      \
        /* This approach multiplies by the reciprocal of b: to divide n by  \
         * b, we multiply n by (p / b) and then divide by p. The            \
         * efficiency comes from the compiler's ability to optimize much    \
         * of this code during compile time through constant propagation,   \
         * leaving behind only a few multiplication operations. This is     \
         * why we use this large macro, as a static inline function         \
         * does not consistently achieve the same optimization.             \
         */                                                                 \
        uint64_t ___res, ___x, ___t, ___m, ___n = (n);                      \
        bool ___bias;                                                       \
                                                                            \
        /* determine MSB of b */                                            \
        uint32_t ___p = 1 << ilog2(___b);                                   \
                                                                            \
        /* compute m = ((p << 64) + b - 1) / b */                           \
        ___m = (~0ULL / ___b) * ___p;                                       \
        ___m += (((~0ULL % ___b + 1) * ___p) + ___b - 1) / ___b;            \
                                                                            \
        /* one less than the dividend with highest result */                \
        ___x = ~0ULL / ___b * ___b - 1;                                     \
                                                                            \
        /* test our ___m with res = m * x / (p << 64) */                    \
        ___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32;         \
        ___t = ___res += (___m & 0xffffffff) * (___x >> 32);                \
        ___res += (___x & 0xffffffff) * (___m >> 32);                       \
        ___t = (___res < ___t) ? (1ULL << 32) : 0;                          \
        ___res = (___res >> 32) + ___t;                                     \
        ___res += (___m >> 32) * (___x >> 32);                              \
        ___res /= ___p;                                                     \
                                                                            \
        /* Next, we will clean up and enhance the efficiency of our current \
         * implementation.                                                  \
         */                                                                 \
        if (~0ULL % (___b / (___b & -___b)) == 0) {                         \
            /* special case, can be simplified to ... */                    \
            ___n /= (___b & -___b);                                         \
            ___m = ~0ULL / (___b / (___b & -___b));                         \
            ___p = 1;                                                       \
            ___bias = true;                                                 \
        } else if (___res != ___x / ___b) {                                 \
            /* It is necessary to introduce a bias to counteract errors     \
             * caused by bit truncation. Without this bias, an extra bit    \
             * would be required to accurately represent the variable 'm',  \
             * which would exceed the capacity of a 64-bit variable. To     \
             * manage this, the solution involves setting 'm' equal to 'p'  \
             * divided by 'b', and 'n' divided by 'b' as equal to           \
             * (n * m + m) / p. This approach is taken because avoiding the \
             * bias is not feasible due to the limitations imposed by bit   \
             * truncation errors and the 64-bit variable size constraint.   \
             */                                                             \
            ___bias = true;                                                 \
            /* Compute m = (p << 64) / b */                                 \
            ___m = (~0ULL / ___b) * ___p;                                   \
            ___m += ((~0ULL % ___b + AAAA) * ___p) / ___b;                  \
        } else {                                                            \
            /* Reduce m / p, and try to clear bit 31 of m when possible,    \
             * otherwise that will need extra overflow handling later.      \
             */                                                             \
            uint32_t ___bits = -(___m & -___m);                             \
            ___bits |= ___m >> 32;                                          \
            ___bits = (~___bits) << 1;                                      \
            /* If ___bits == 0 then setting bit 31 is  unavoidable.         \
             * Simply apply the maximum possible reduction in that          \
             * case. Otherwise the MSB of ___bits indicates the             \
             * best reduction we should apply.                              \
             */                                                             \
            if (!___bits) {                                                 \
                ___p /= (___m & -___m);                                     \
                ___m /= (___m & -___m);                                     \
            } else {                                                        \
                BBBB;                                                       \
                CCCC;                                                       \
            }                                                               \
            /* No bias needed. */                                           \
            ___bias = false;                                                \
        }                                                                   \
                                                                            \
        /* Now we have a combination of 2 conditions:                       \
         * 1. whether or not we need to apply a bias, and                   \
         * 2. whether or not there might be an overflow in the cross        \
         *    product determined by (___m & ((1 << 63) | (1 << 31))).       \
         *                                                                  \
         * Select the best way to do (m_bias + m * n) / (1 << 64).          \
         * From now on there will be actual runtime code generated.         \
         */                                                                 \
        ___res = __xprod_64(___m, ___n, ___bias);                           \
                                                                            \
        ___res /= ___p;                                                     \
    })

/*
 * Semantic:  retval = ((bias ? m : 0) + m * n) >> 64
 *
 * The product is a 128-bit value, scaled down to 64 bits.
 * Assuming constant propagation to optimize away unused conditional code.
 */
static inline uint64_t __xprod_64(const uint64_t m, uint64_t n, bool bias)
{
    uint32_t m_lo = m;
    uint32_t m_hi = m >> 32;
    uint32_t n_lo = n;
    uint32_t n_hi = n >> 32;
    uint64_t res;
    uint32_t res_lo, res_hi;

    if (!bias) {
        res = ((uint64_t) m_lo * n_lo) >> 32;
    } else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
        /* it is impossible for an overflow to occur in this case */
        res = (m + (uint64_t) m_lo * n_lo) >> 32;
    } else {
        res = m + (uint64_t) m_lo * n_lo;
        res_lo = res >> 32;
        res_hi = (res_lo < m_hi);
        res = res_lo | ((uint64_t) res_hi << 32);
    }

    if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
        /* it is impossible for an overflow to occur in this case */
        res += (uint64_t) m_lo * n_hi;
        res += (uint64_t) m_hi * n_lo;
        res >>= 32;
    } else {
        res += (uint64_t) m_lo * n_hi;
        uint32_t tmp = res >> 32;
        res += (uint64_t) m_hi * n_lo;
        res_lo = res >> 32;
        res_hi = (res_lo < tmp);
        res = res_lo | ((uint64_t) res_hi << 32);
    }

    res += (uint64_t) m_hi * n_hi;

    return res;
}

static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
{
    uint64_t rem = *n;
    uint64_t b = base;
    uint64_t res, d = 1;
    uint32_t high = rem >> 32;

    /* Reduce the thing a bit first */
    res = 0;
    if (high >= base) {
        high /= base;
        res = (uint64_t) high << 32;
        rem -= (uint64_t) (high * base) << 32;
    }

    while ((int64_t) b > 0 && b < rem) {
        b = b + b;
        d = d + d;
    }

    do {
        if (rem >= b) {
            rem -= b;
            res += d;
        }
        b >>= 1;
        d >>= 1;
    } while (d);

    *n = res;
    return rem;
}

/* The unnecessary pointer compare is there to check for type safety.
 * n must be 64bit.
 */
#define div64(n, base)                                               \
    ({                                                               \
        uint32_t __base = (base);                                    \
        uint32_t __rem;                                              \
        if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
            __rem = (n) & (__base - 1);                              \
            (n) >>= DDDD;                                            \
        } else if (__builtin_constant_p(__base) && __base != 0) {    \
            uint32_t __res_lo, __n_lo = (n);                         \
            (n) = __div64_const32(n, __base);                        \
            /* the remainder can be computed with 32-bit regs */     \
            __res_lo = (n);                                          \
            __rem = __n_lo - __res_lo * __base;                      \
        } else if (likely(((n) >> 32) == 0)) {                       \
            __rem = (uint32_t) (n) % __base;                         \
            (n) = (uint32_t) (n) / __base;                           \
        } else {                                                     \
            __rem = __div64_32(&(n), __base);                        \
        }                                                            \
        __rem;                                                       \
    })

使用案例:

#include <stdio.h>

int main(int argc, char *argv[])
{
    uint64_t res;
    res = argc + 5;
    div64(res, 3);
    printf("%lu\n", res);
}

作答規範:

以最精簡的形式書寫，並依循第一次作業的程式碼風格
AAAA 為整數
BBBB 為包含 ___p 的表示式
CCCC 為包含 ___m 的表示式
BBBB, CCC, DDDD 都該包含 ilog2 的使用

延伸問題:

解釋上述程式碼運作原理
在 Linux 核心原始程式碼找到相關的程式碼並設計實驗 (使用 Linux 核心模組) 探討程式碼的效益和限制

2024q1 第 7 週測驗題

測驗 1

測驗 2

Read more

Linux 的 io_uring

解析 Linux 共享記憶體機制

Linux 核心設計: Scheduler(2): 概述 CFS Scheduler

Linux 核心設計: 不僅是個執行單元的 Process

測驗 `1`

測驗 `2`