2022q1 Homework 5 (quiz8)

contributed by < 2020leon >

archive: 2022-04-16 07:34:12 UTC+0

測驗題目

測驗 `1`

完整程式碼如下。

#include <stddef.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>

/* Nonzero if pointer X is not aligned on a "long" boundary.
 * The pointer is converted through uintptr_t so no address bits are lost on
 * platforms where long is narrower than a pointer.
 */
#define UNALIGNED(X) ((uintptr_t) (X) & (sizeof(long) - 1))

/* How many bytes are loaded each iteration of the word search loop */
#define LBLOCKSIZE (sizeof(long))

/* Threshold for punting to the bytewise iterator */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)

#if LONG_MAX == 2147483647L
/* Nonzero if X (a 32-bit word) contains a zero byte. */
#define DETECT_NULL(X) (((X) - 0x01010101UL) & ~(X) & 0x80808080UL)
#else
#if LONG_MAX == 9223372036854775807L
/* Nonzero if X (a 64-bit word) contains a zero byte. */
#define DETECT_NULL(X) \
    (((X) - 0x0101010101010101UL) & ~(X) & 0x8080808080808080UL)
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif

/* @return nonzero if the word X contains the byte used to fill MASK.
 * XORing with MASK turns every matching byte into zero, which DETECT_NULL
 * then finds.
 */
#define DETECT_CHAR(X, MASK) (DETECT_NULL((X) ^ (MASK)))

/**
 * memchr_opt - locate the first occurrence of a byte in a memory area
 * @src_void: start of the area to search
 * @c: value to look for; it is converted to unsigned char before comparing
 * @length: number of bytes to search
 *
 * The search runs bytewise until the pointer is word-aligned, then one
 * "long" at a time while at least LBLOCKSIZE bytes remain, and finally
 * bytewise again over the word that matched or over the remaining tail.
 *
 * @return pointer to the first matching byte, or NULL if none is found
 * within @length bytes.
 */
void *memchr_opt(const void *src_void, int c, size_t length)
{
    const unsigned char *src = (const unsigned char *) src_void;
    unsigned char d = c;

    /* Bytewise search until src reaches a word boundary. */
    while (UNALIGNED(src)) {
        if (!length--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }

    if (!TOO_SMALL(length)) {
        /* If we get this far, we know that length is large and
         * src is word-aligned.
         */

        /* The fast code reads the source one word at a time and only performs
         * the bytewise search on word-sized segments if they contain the search
         * character, which is detected by XORing the word-sized segment with a
         * word-sized block of the search character and then detecting for the
         * presence of NULL in the result.
         *
         * NOTE(review): the bytes are read through an unsigned long lvalue;
         * confirm the build tolerates this with respect to strict aliasing.
         */
        const unsigned long *asrc = (const unsigned long *) src;

        /* Replicate d into every byte of mask by doubling the filled width.
         * The shift count is always smaller than the width of long.
         */
        unsigned long mask = d;
        for (size_t shift = 8; shift < LBLOCKSIZE * 8; shift <<= 1)
            mask |= mask << shift;

        while (length >= LBLOCKSIZE) {
            /* XXXXX: Your implementation should appear here */
            /* A nonzero DETECT_CHAR means this word holds the byte we want,
             * so leave the loop and let the bytewise search pinpoint it.
             */
            if (DETECT_CHAR(*asrc, mask))
                break;
            /* No match in this word: advance by one word. */
            asrc++;
            length -= LBLOCKSIZE;
        }

        /* Either a word matched or fewer than LBLOCKSIZE bytes are left;
         * both cases are finished by the bytewise loop below.
         */
        src = (const unsigned char *) asrc;
    }

    while (length--) {
        if (*src == d)
            return (void *) src;
        src++;
    }

    return NULL;
}

XXXXX 處如下：

/* Stop scanning words as soon as the current one contains the target byte. */
if (DETECT_CHAR(*asrc, mask))
    break;
/* No match in this word: step to the next word and shrink the remaining length. */
asrc++;
length -= LBLOCKSIZE;

解釋測驗 `1` 程式碼運作原理

設計實驗觀察隨字串長度和特定 pattern 變化的效能影響

在核心原始程式碼找出 x86_64 或 arm64 對應的最佳化實作

測驗 `2`

測驗 2 gist

所回答之完整程式碼如下。

/* Choose the index to retry with: the value freshly loaded from *loc when
 * before(idx, value) holds, otherwise idx advanced by one slot.
 */
static inline ringidx_t cond_reload(ringidx_t idx, const ringidx_t *loc)
{
    const ringidx_t latest = __atomic_load_n(loc, __ATOMIC_RELAXED);

    /* The shared index is already past ours, so adopt it. */
    if (before(idx, latest))
        return latest;

    /* Otherwise continue with the next slot. */
    /* XXXXX */ idx++;
    return idx;
}

/* Return an up-to-date tail index for a dequeuer that found the ring empty.
 *
 * @lfr:  ring to inspect
 * @head: the caller's current head index
 * @tail: the caller's (possibly stale) tail index
 *
 * In single-producer mode this is just a fresh acquire-load of lfr->tail.
 * In multi-producer mode the ring is scanned forward from @tail for slots
 * whose idx already equals their position, and the advanced index is passed
 * to cond_update() on lfr->tail; that call's result is returned.
 */
static inline ringidx_t find_tail(lfring_t *lfr, ringidx_t head, ringidx_t tail)
{
    if (lfr->flags & LFRING_FLAG_SP) /* single-producer enqueue */
        return __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);

    /* Multi-producer enqueue.
     * Scan ring for new elements that have been written but not released.
     */
    ringidx_t mask = lfr->mask;
    ringidx_t size = /* XXXXX */ mask + 1;
    /* __atomic_load_n() requires an explicit memory order. Acquire is used
     * so that the slot's ptr, read later by the dequeuer, is ordered after
     * the idx value observed here.
     */
    while (before(tail, head + size) &&
           __atomic_load_n(/* XXXXX */ &lfr->ring[tail & mask].idx,
                           __ATOMIC_ACQUIRE) == tail)
        tail++;
    tail = cond_update(&lfr->tail, tail);
    return tail;
}

/* Abridged listing of lfring_dequeue(): the body of the retry loop is
 * omitted here (see the "skipped" marker), so only the head update remains.
 */
uint32_t lfring_dequeue(lfring_t *lfr,
                        void **restrict elems,
                        uint32_t n_elems,
                        uint32_t *index)
{
    ringidx_t mask = lfr->mask;
    intptr_t actual;
    ringidx_t head = __atomic_load_n(&lfr->head, __ATOMIC_RELAXED);
    ringidx_t tail = __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);
    /* Retry until the compare-exchange moves lfr->head from head to
     * head + actual; a failed attempt refreshes the local head.
     */
    do { /* skipped */
    } while (!__atomic_compare_exchange_n(
        &lfr->head, &head, /* Updated on failure */
        /* XXXXX */ head + actual,
        /* weak */ false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
    /* Report the head index the dequeue started from and the element count. */
    *index = (uint32_t) head;
    return (uint32_t) actual;
}

陳述式	答案
`DDD`	`idx++`
`KKK`	`mask + 1`
`TTT`	`&lfr->ring[tail & mask].idx, __ATOMIC_ACQUIRE`
`HHH`	`head + actual`

解釋測驗 `2` 程式碼運作原理

本題以環狀結構實現容量有限的佇列（ queue ），用以解決生產者消費者問題。環狀結構可以簡單以下圖表示。

（圖片無法顯示：原圖為環狀緩衝區示意圖，請參見下方來源連結。）

source: https://doc.dpdk.org/guides/prog_guide/ring_lib.html

上圖為一切斷的環，以 FIFO 的方式儲存資料。每當有新資料加入時，便會在上圖右側之處加入。若到達邊緣則再從左方持續加入，直到滿為止。而當有資料要從結構中移除時，則從上圖左側移除。

在本題中所對應的結構如下。

/* Ring buffer control block; the slot array is a flexible array member at
 * the end of the same object.
 */
struct lfring {
    ringidx_t head; /* consumer-side index, advanced by lfring_dequeue() */
    ringidx_t tail ALIGNED(CACHE_LINE); /* producer-side index */
    uint32_t mask;  /* ring size minus one; set in lfring_alloc() */
    uint32_t flags; /* mode bits such as LFRING_FLAG_SP / LFRING_FLAG_SC */
    struct element ring[] ALIGNED(CACHE_LINE); /* the slots themselves */
} ALIGNED(CACHE_LINE);

其中 head 和 tail 分別對應到上方左邊以及右邊的箭頭， mask 隱含環狀結構大小之意， flags 則指出生產者消費者模式（分別為 SPSC 、 SPMC 、 MPSC 及 MPMC），最後 ring 則是環狀結構本體。

下方分別理解 lfring_alloc, lfring_dequeue 和 lfring_enqueue 。

`lfring_alloc`

lfring_t *lfring_alloc(uint32_t n_elems, uint32_t flags)
{
    unsigned long ringsz = ROUNDUP_POW2(n_elems);
    if (n_elems == 0 || ringsz == 0 || ringsz > 0x80000000) {
        assert(0 && "invalid number of elements");
        return NULL;
    }
    if ((flags & ~SUPPORTED_FLAGS) != 0) {
        assert(0 && "invalid flags");
        return NULL;
    }

    size_t nbytes = sizeof(lfring_t) + ringsz * sizeof(struct element);
    lfring_t *lfr = osal_alloc(nbytes, CACHE_LINE);
    if (!lfr)
        return NULL;

    lfr->head = 0, lfr->tail = 0;
    lfr->mask = ringsz - 1;
    lfr->flags = flags;
    for (ringidx_t i = 0; i < ringsz; i++) {
        lfr->ring[i].ptr = NULL;
        lfr->ring[i].idx = i - ringsz;
    }
    return lfr;
}

lfring_alloc 可視為結構的建構子。由實作可見 mask 的值為 ringsz - 1 ，而 ringsz 又為 2 的冪，因此可驗證上述所言「 mask 隱含環狀結構大小之意」。在此將 ringsz 設為 2 的冪的考量為：索引值可藉由與 mask 進行 bitwise-AND 運算（例如 `tail & mask`）來取代對 ringsz 做取餘數運算，精進程式效能。

以 bitwise-AND 取代 modulo 運算不只是「精進」效能，更是為了確保關鍵操作的執行時間： modulo 和 div 在現代處理器上雖然有很大的改進，但仍無法保證一定在固定的 CPU cycle 內完成。

（圖片無法顯示）

jserv

`lfring_dequeue`

/* Dequeue elements from head.
 *
 * @lfr:     ring to dequeue from
 * @elems:   output array receiving up to n_elems element pointers
 * @n_elems: maximum number of elements to take
 * @index:   receives the head index the dequeue started from
 *
 * @return the number of elements stored in elems; 0 when none are available
 * (in which case *index is not written).
 */
uint32_t lfring_dequeue(lfring_t *lfr,
                        void **restrict elems,
                        uint32_t n_elems,
                        uint32_t *index)
{
    ringidx_t mask = lfr->mask;
    intptr_t actual;
    ringidx_t head = __atomic_load_n(&lfr->head, __ATOMIC_RELAXED);
    ringidx_t tail = __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);
    do {
        /* Number of elements to take: what is available between head and
         * tail, capped at n_elems.
         */
        actual = MIN((intptr_t) (tail - head), (intptr_t) n_elems);
        if (UNLIKELY(actual <= 0)) {
            /* Ring buffer is empty, scan for new but unreleased elements */
            tail = find_tail(lfr, head, tail);
            actual = MIN((intptr_t) (tail - head), (intptr_t) n_elems);
            if (actual <= 0)
                return 0;
        }
        /* Copy the element pointers out before head is published. */
        for (uint32_t i = 0; i < (uint32_t) actual; i++)
            elems[i] = lfr->ring[(head + i) & mask].ptr;
        smp_fence(LoadStore);                        // Order loads only
        if (UNLIKELY(lfr->flags & LFRING_FLAG_SC)) { /* Single-consumer */
            /* This path writes head with a plain store instead of a CAS. */
            __atomic_store_n(&lfr->head, head + actual, __ATOMIC_RELAXED);
            break;
        }

        /* else: lock-free multi-consumer */
        /* The CAS below claims [head, head + actual); if it fails, head has
         * been refreshed and the whole attempt is repeated.
         */
    } while (!__atomic_compare_exchange_n(
        &lfr->head, &head, /* Updated on failure */
        /* XXXXX HHH */ head + actual,
        /* weak */ false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
    *index = (uint32_t) head;
    return (uint32_t) actual;
}

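lfring_dequeue 首先計算可取出的元素數 actual（即 tail - head 與 n_elems 兩者中較小者）。若沒有任何元素可取，便呼叫 find_tail() 重新取得 tail ：在單生產者模式下， find_tail() 僅重新讀取環的 tail 後便直接返回；在多生產者模式下，由於 tail 的值會被多個生產者更改，在前方所快取的 tail 變數不一定為最新的 tail ，因此 find_tail() 會掃描環中已寫入但尚未發布的元素，以取得最新的 tail 。若更新後仍無元素可取， lfring_dequeue 便返回 0 。在此之後， actual 的值便會確定。 actual 為一紀錄實際可取元素數的變數。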

接下來便來到內部的 for 迴圈，一一將位於環中 head 位置的元素取出並存於 elems 中。

最後，在單消費者模式下，由於只會有目前的執行緒更動環的 head ，因此可在更動環的 head 後直接進行收尾。在多消費者模式下，因為會有多個消費者更動環的 head ，因此需在主邏輯結束後以 compare-and-exchange 檢查所快取的 head 是否仍與環的 head 相同，若是則更新環的 head 並收尾，否則更新快取的 head 後，再次從計算可取出的元素數開始。

`lfring_enqueue`

/* Enqueue elements at tail.
 *
 * @lfr:     ring to enqueue into
 * @elems:   array of element pointers to insert
 * @n_elems: number of entries in elems
 *
 * @return the number of elements actually enqueued, which may be smaller
 * than n_elems (including 0) when the ring lacks room.
 */
uint32_t lfring_enqueue(lfring_t *lfr,
                        void *const *restrict elems,
                        uint32_t n_elems)
{
    intptr_t actual = 0;
    ringidx_t mask = lfr->mask;
    ringidx_t size = mask + 1;
    ringidx_t tail = __atomic_load_n(&lfr->tail, __ATOMIC_RELAXED);

    if (lfr->flags & LFRING_FLAG_SP) { /* single-producer */
        ringidx_t head = __atomic_load_n(&lfr->head, __ATOMIC_ACQUIRE);
        /* Free room is head + size - tail, capped at n_elems. */
        actual = MIN((intptr_t) (head + size - tail), (intptr_t) n_elems);
        if (actual <= 0)
            return 0;

        for (uint32_t i = 0; i < (uint32_t) actual; i++) {
            /* The slot must still carry the index from one lap earlier. */
            assert(lfr->ring[tail & mask].idx == tail - size);
            lfr->ring[tail & mask].ptr = *elems++;
            lfr->ring[tail & mask].idx = tail;
            tail++;
        }
        /* Publish all newly written slots with a single release store. */
        __atomic_store_n(&lfr->tail, tail, __ATOMIC_RELEASE);
        return (uint32_t) actual;
    }

    /* else: lock-free multi-producer */
restart:
    /* Keep going while elements remain and tail is still before head + size. */
    while ((uint32_t) actual < n_elems &&
           before(tail, __atomic_load_n(&lfr->head, __ATOMIC_ACQUIRE) + size)) {
        /* View a slot both as a struct element and as a ptrpair_t so that
         * ptr and idx can be swapped together by lf_compare_exchange().
         */
        union {
            struct element e;
            ptrpair_t pp;
        } old, neu;
        void *elem = elems[actual];
        struct element *slot = &lfr->ring[tail & mask];
        old.e.ptr = __atomic_load_n(&slot->ptr, __ATOMIC_RELAXED);
        old.e.idx = __atomic_load_n(&slot->idx, __ATOMIC_RELAXED);
        do {
            /* A slot is free for position tail only if its idx is tail - size. */
            if (UNLIKELY(old.e.idx != tail - size)) {
                if (old.e.idx != tail) {
                    /* We are far behind. Restart with fresh index */
                    tail = cond_reload(tail, &lfr->tail);
                    goto restart;
                }
                /* slot already enqueued */
                tail++; /* Try next slot */
                goto restart;
            }

            /* Found slot that was used one lap back.
             * Try to enqueue next element.
             */
            neu.e.ptr = elem;
            neu.e.idx = tail; /* Set idx on enqueue */
        } while (!lf_compare_exchange((ptrpair_t *) slot, &old.pp, neu.pp));

        /* Enqueue succeeded */
        actual++;
        tail++; /* Continue with next slot */
    }
    /* Hand the advanced tail to cond_update(); its result is not needed here. */
    (void) cond_update(&lfr->tail, tail);
    return (uint32_t) actual;
}

lfring_enqueue 針對單生產者和多生產者有著不同的策略。若為單生產者，則確認是否有空間插入資料，若無則返回，若有則依據目前最大容量依序將欲插入之資料放入環中，更新環的 tail 後返回。

若為多生產者模式，亦先確認是否有空間插入資料，若無空間則返回。若有空間，由於多個生產者會更動環的 tail ，因此需檢查快取的 tail 所對應的 slot 是否為空。若所對應的 slot 不為空的話，則依據不為空的原因來更新快取的 tail ：如是因為其他的生產者插入少許新資料導致，則更新快取的 tail 以查看下一個 slot 的狀態；如是因為其他的生產者插入多個新資料導致，則更新快取的 tail 以查看最新的 tail 所對應之 slot 的狀態。在此，「多個」係指其他生產者所插入的資料數大於等於環的空間。

在確保現在快取的 tail 對應之 slot 為空後，最後再經由 lf_compare_exchange() 確保沒有任何其他生產者對於該 slot 插入資料才實際更新將資料插入環中。

測驗 `3`

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/workqueue.h>

MODULE_AUTHOR("National Cheng Kung University, Taiwan");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("A kernel module that kills ptrace tracer and its tracees");

/* Delay, in jiffies, before each run of the periodic routine. */
#define JIFFIES_DELAY 1
/* Name passed to create_workqueue() in dont_trace_init(). */
#define DONT_TRACE_WQ_NAME "dont_trace_worker"

static void periodic_routine(struct work_struct *);
/* Delayed work item whose handler is periodic_routine(). */
static DECLARE_DELAYED_WORK(dont_trace_task, periodic_routine);
/* Workqueue created in dont_trace_init() and destroyed in dont_trace_exit(). */
static struct workqueue_struct *wq;
/* Set to true by init and false by exit; periodic_routine() only runs
 * check() while it is true.
 */
static bool loaded;

/* Deliver SIGKILL to @task, flagged as sent from kernel space. */
static void kill_task(struct task_struct *task)
{
    const int from_kernel = 1; /* third argument of send_sig() */

    send_sig(SIGKILL, task, from_kernel);
}

/* @children: head of a task's ptraced list
 *
 * Every node on that list is the ptrace_entry member embedded in a tracee's
 * task_struct, so the owner is a tracer exactly when the list is non-empty.
 *
 * @return true if the process has tracees
 */
static bool is_tracer(struct list_head *children)
{
    return !list_empty(children);
}

/* Traverse the element in the linked list of the ptraced proccesses and
 * finally kills them.
 */
static void kill_tracee(struct list_head *children)
{
    struct list_head *list;
    list_for_each (list, children) {
        struct task_struct *task_ptraced =
            list_entry(list, struct task_struct, ptrace_entry);
        pr_info("ptracee -> comm: %s, pid: %d, gid: %d, ptrace: %d\n",
                task_ptraced->comm, task_ptraced->pid, task_ptraced->tgid,
                task_ptraced->ptrace);
        kill_task(task_ptraced);
    }
}

/* Scan every process; for each one that has tracees, kill the tracees and
 * then the tracer itself.
 */
static void check(void)
{
    struct task_struct *task;

    /* The process list must not be walked without protection: hold the RCU
     * read lock so the task_structs visited stay valid during the scan.
     * NOTE(review): the per-task ptraced lists are normally guarded by
     * tasklist_lock, which modules cannot take; confirm this is acceptable.
     */
    rcu_read_lock();
    for_each_process (task) {
        if (!is_tracer(&task->ptraced))
            continue;

        kill_tracee(&task->ptraced);
        kill_task(task); /* Kill the tracer once all tracees are killed */
    }
    rcu_read_unlock();
}

static void periodic_routine(struct work_struct *ws)
{
    /* Check if the module is loaded. */
    if (likely(/* XXXXX: Implement */loaded))
        check();
    /* XXXXX: Implement */
    /* Put work task in global workqueue after JIFFIES_DELAY */
    schedule_delayed_work(&dont_trace_task, JIFFIES_DELAY);
}

/* Module entry point: create the workqueue and schedule the first scan.
 *
 * @return 0 on success, -ENOMEM if the workqueue cannot be created.
 */
static int __init dont_trace_init(void)
{
    wq = create_workqueue(DONT_TRACE_WQ_NAME);
    if (!wq)
        return -ENOMEM; /* queueing on a NULL workqueue would crash */

    /* Mark the module as loaded before the work can possibly run, so the
     * first invocation of periodic_routine() already performs a scan.
     */
    loaded = true;
    queue_delayed_work(wq, &dont_trace_task, JIFFIES_DELAY);

    pr_info("Loaded!\n");
    return 0;
}

/* Module exit point: stop the periodic work and release the workqueue. */
static void __exit dont_trace_exit(void)
{
    loaded = false;

    /* The handler re-queues itself, so a plain cancel could race with a
     * running instance that re-arms the work afterwards. The _sync variant
     * waits for a running handler and leaves the work neither pending nor
     * executing, whichever workqueue it was queued on.
     */
    cancel_delayed_work_sync(&dont_trace_task);

    /* Nothing can be queued any more; destroying the workqueue is safe. */
    destroy_workqueue(wq);

    pr_info("Unloaded.\n");
}

module_init(dont_trace_init);
module_exit(dont_trace_exit);

程式碼運作原理

當 module 被載入 kernel 後，便會執行 dont_trace_init 函式。其功能為先以 create_workqueue 建立模組專屬的 workqueue（即 wq ），再將已經初始化的 dont_trace_task 排入該 workqueue 中，並在 JIFFIES_DELAY 後執行所指定的函式，在此即 periodic_routine 。

periodic_routine 的功能為利用 check() 刪除所有 tracer 和 tracee ，並再次將 dont_trace_task 排入 workqueue 中，待 JIFFIES_DELAY 後再執行自己。

check 和其所呼叫的函式即為本模組的關鍵所在。其中 is_tracer 函式用於檢查 process 是否有追蹤其他的 process 。其技巧為檢查 task->ptraced 這個鏈結串列中是否存在任何節點：串列上的每個節點都是內嵌於某個 tracee 之 struct task_struct 中的 ptrace_entry 成員，因此只要串列不為空，該 process 即為 tracer 。在確定該 process 為 tracer 後，則藉 kill_tracee(&task->ptraced) 和 kill_task(task) 立即中止其所有 tracee 和該 process 。

kill_task 發送中止訊號 SIGKILL 至所指定的 task 中。其呼叫的函式 send_sig 之第三個引數（ priv ）為非零值時，表示該訊號是由 kernel 所發出，而非來自使用者行程。

探討行程避免被追蹤的手法

在該文章中所提及的其中一個方法為設置 ptrace_scope ，將 ptrace_scope 設為最高級別的 3 可避免任何人的追蹤。

2022q1 Homework 5 (quiz8)

測驗題目

測驗 1

解釋測驗 1 程式碼運作原理

設計實驗觀察隨字串長度和特定 pattern 變化的效能影響

在核心原始程式碼找出 x86_64 或 arm64 對應的最佳化實作

測驗 2

解釋測驗 2 程式碼運作原理

lfring_alloc

lfring_dequeue

lfring_enqueue

測驗 3