Try   HackMD

2022q1 Homework 5 (quiz8)

contributed by < 2020leon >

archive: 2022-04-16 07:34:12 UTC+0

測驗題目

測驗 1

完整程式碼如下。

#include <stddef.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>

/* Nonzero if pointer X is not aligned on a "long" boundary.
 * The pointer is converted through uintptr_t so no address bits are lost on
 * platforms where long is narrower than a pointer.
 */
#define UNALIGNED(X) ((uintptr_t) (X) & (sizeof(long) - 1))

/* How many bytes are loaded each iteration of the word scan loop */
#define LBLOCKSIZE (sizeof(long))

/* Threshold for punting to the bytewise iterator */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)

/* Nonzero if X (an unsigned long) contains a zero byte. */
#if LONG_MAX == 2147483647L
#define DETECT_NULL(X) (((X) - 0x01010101UL) & ~(X) & 0x80808080UL)
#elif LONG_MAX == 9223372036854775807L
#define DETECT_NULL(X) \
    (((X) - 0x0101010101010101UL) & ~(X) & 0x8080808080808080UL)
#else
#error long int is not a 32bit or 64bit type.
#endif

/* @return nonzero if (long)X contains the byte used to fill MASK. */
#define DETECT_CHAR(X, MASK) (DETECT_NULL((X) ^ (MASK)))

/* Locate the first occurrence of a byte in a memory area.
 *
 * @src_void: start of the area to scan
 * @c: byte to look for (converted to unsigned char)
 * @length: number of bytes to scan
 * @return pointer to the first matching byte, or NULL if none is found
 *
 * Bytes are checked one at a time until the pointer is long-aligned, then a
 * word at a time, and finally one at a time again for the word that contains
 * the match (or for the tail shorter than a word).
 */
void *memchr_opt(const void *src_void, int c, size_t length)
{
    const unsigned char *src = (const unsigned char *) src_void;
    unsigned char d = c;

    /* Bytewise scan until src reaches a word boundary. */
    while (UNALIGNED(src)) {
        if (!length--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }

    if (!TOO_SMALL(length)) {
        /* If we get this far, we know that length is large and
         * src is word-aligned.
         */

        /* The fast code reads the source one word at a time and only performs
         * the bytewise search on word-sized segments if they contain the search
         * character, which is detected by XORing the word-sized segment with a
         * word-sized block of the search character and then detecting for the
         * presence of NULL in the result.
         */
        const unsigned long *asrc = (const unsigned long *) src;

        /* ULONG_MAX / 0xFF is 0x0101...01, so the product replicates d into
         * every byte of the word regardless of the width of long.
         */
        unsigned long mask = (ULONG_MAX / 0xFF) * d;

        while (length >= LBLOCKSIZE) {
            /* XXXXX: Your implementation should appear here */
            /* Use macro DETECT_CHAR to check whether the word contains the
             * byte that was used to fill mask. A nonzero result means the
             * byte is somewhere in this word, so leave the loop and let the
             * bytewise loop below pinpoint it.
             */
            if (DETECT_CHAR(*asrc, mask))
                break;
            /* Advance the pointer and shrink the length after checking. */
            asrc++;
            length -= LBLOCKSIZE;
        }

        /* If there are fewer than LBLOCKSIZE characters left, then we resort to
         * the bytewise loop.
         */
        src = (const unsigned char *) asrc;
    }

    while (length--) {
        if (*src == d)
            return (void *) src;
        src++;
    }

    return NULL;
}

XXXXX 處如下:

if (DETECT_CHAR(*asrc, mask))
    break;
asrc++;
length -= LBLOCKSIZE;

解釋測驗 1 程式碼運作原理

設計實驗觀察隨字串長度和特定 pattern 變化的效能影響

在核心原始程式碼找出 x86_64 或 arm64 對應的最佳化實作


測驗 2

測驗 2 gist

所回答之完整程式碼如下。

/* Pick the index to retry with after falling behind.
 *
 * @idx: index the caller was working on
 * @loc: location of the shared index to re-read
 * @return the freshly loaded value of *loc when it is ahead of idx,
 *         otherwise idx advanced by one slot
 */
static inline ringidx_t cond_reload(ringidx_t idx, const ringidx_t *loc)
{
    const ringidx_t latest = __atomic_load_n(loc, __ATOMIC_RELAXED);

    /* The shared index has moved past ours: jump straight to it. */
    if (before(idx, latest))
        return latest;

    /* Otherwise simply continue with the next slot (quiz blank DDD). */
    idx++;
    return idx;
}

/* Obtain an up-to-date tail index for a consumer that found the ring empty.
 *
 * @lfr: ring buffer
 * @head: consumer's view of the head index
 * @tail: consumer's (possibly stale) view of the tail index
 * @return the tail index to use for computing how many elements are available
 */
static inline ringidx_t find_tail(lfring_t *lfr, ringidx_t head, ringidx_t tail)
{
    if (lfr->flags & LFRING_FLAG_SP) /* single-producer enqueue */
        return __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);

    /* Multi-producer enqueue.
     * Scan ring for new elements that have been written but not released.
     */
    ringidx_t mask = lfr->mask;
    ringidx_t size = /* XXXXX KKK */ mask + 1;
    /* A slot whose idx equals the scanned position already holds the element
     * for that position. __atomic_load_n() requires an explicit memory order;
     * acquire is used so that the slot contents read afterwards are not
     * older than the idx observed here.
     */
    while (before(tail, head + size) &&
           __atomic_load_n(/* XXXXX TTT */ &lfr->ring[tail & mask].idx,
                           __ATOMIC_ACQUIRE) == tail)
        tail++;
    /* Hand the discovered position to cond_update() for the shared tail and
     * continue with whatever index it returns.
     */
    tail = cond_update(&lfr->tail, tail);
    return tail;
}

/* Abridged quiz listing of lfring_dequeue(): the body of the do-while loop
 * (which computes `actual` and copies the elements out) is omitted here, so
 * this excerpt is not meant to be compiled as-is. Only the head update, which
 * contains quiz blank HHH, is shown.
 */
uint32_t lfring_dequeue(lfring_t *lfr,
                        void **restrict elems,
                        uint32_t n_elems,
                        uint32_t *index)
{
    ringidx_t mask = lfr->mask;
    intptr_t actual;
    ringidx_t head = __atomic_load_n(&lfr->head, __ATOMIC_RELAXED);
    ringidx_t tail = __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);
    do { /* skipped */
    /* Try to advance the shared head by `actual`; on failure `head` is
     * refreshed with the current shared value and the loop runs again.
     */
    } while (!__atomic_compare_exchange_n(
        &lfr->head, &head, /* Updated on failure */
        /* XXXXX */ head + actual,
        /* weak */ false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
    /* Report the index of the first dequeued element and the count. */
    *index = (uint32_t) head;
    return (uint32_t) actual;
}
陳述式 答案
DDD idx++
KKK mask + 1
TTT &lfr->ring[tail & mask].idx, __ATOMIC_ACQUIRE
HHH head + actual

解釋測驗 2 程式碼運作原理

本題以環狀結構實現容量有限的佇列( queue ),用以解決生產者消費者問題。環狀結構可以簡單以下圖表示。

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →

source: https://doc.dpdk.org/guides/prog_guide/ring_lib.html

上圖為一切斷的環,以 FIFO 的方式儲存資料。每當有新資料加入時,便會在上圖右側之處加入。若到達邊緣則再從左方持續加入,直到滿為止。而當有資料要從結構中移除時,則從上圖左側移除。

在本題中所對應的結構如下。

/* Control block of the ring buffer, followed by the slot array itself. */
struct lfring {
    /* Index of the next element to dequeue; advanced by lfring_dequeue(). */
    ringidx_t head;
    /* Index of the next slot to enqueue into; aligned to its own cache line. */
    ringidx_t tail ALIGNED(CACHE_LINE);
    /* Ring size minus one (the size is a power of two); `index & mask`
     * selects a slot.
     */
    uint32_t mask;
    /* LFRING_FLAG_* bits such as LFRING_FLAG_SP / LFRING_FLAG_SC. */
    uint32_t flags;
    /* Slot storage; its length is fixed at allocation time. */
    struct element ring[] ALIGNED(CACHE_LINE);
} ALIGNED(CACHE_LINE);

其中 head 和 tail 分別對應到上方左邊以及右邊的箭頭, mask 隱含環狀結構大小之意, flags 則指出生產者消費者模式(分別為 SPSC 、 SPMC 、 MPSC 及 MPMC),最後 ring 則是環狀結構本體。

下方分別理解 lfring_alloc 、 lfring_dequeue 和 lfring_enqueue 。

lfring_alloc
/* Allocate and initialise a ring buffer.
 *
 * @n_elems: requested capacity; rounded up to a power of two
 * @flags: LFRING_FLAG_* bits; anything outside SUPPORTED_FLAGS is rejected
 * @return the new ring, or NULL on invalid arguments or allocation failure
 */
lfring_t *lfring_alloc(uint32_t n_elems, uint32_t flags)
{
    /* A power-of-two capacity lets (capacity - 1) act as the index mask. */
    unsigned long capacity = ROUNDUP_POW2(n_elems);
    if (!n_elems || !capacity || capacity > 0x80000000) {
        assert(0 && "invalid number of elements");
        return NULL;
    }
    if (flags & ~SUPPORTED_FLAGS) {
        assert(0 && "invalid flags");
        return NULL;
    }

    /* Header plus one struct element per slot, cache-line aligned. */
    size_t total = sizeof(lfring_t) + capacity * sizeof(struct element);
    lfring_t *fresh = osal_alloc(total, CACHE_LINE);
    if (fresh == NULL)
        return NULL;

    fresh->head = 0;
    fresh->tail = 0;
    fresh->mask = capacity - 1;
    fresh->flags = flags;

    /* Give every slot the index it would carry one lap earlier, which is
     * the value the enqueue path expects to find in a free slot.
     */
    ringidx_t i = 0;
    while (i < capacity) {
        struct element *slot = &fresh->ring[i];
        slot->ptr = NULL;
        slot->idx = i - capacity;
        i++;
    }
    return fresh;
}

lfring_alloc 可視為結構的建構子。由實做可見 mask 的值為 ringsz - 1 ,而 ringsz 又為 2 的冪,因此可驗證上述所言「 mask 隱含環狀結構大小之意」。在此將 ringsz 設為 2 的冪的考量為,索引值可藉由與 mask 進行位元且運算來取代與 ringsz 做取餘運算,精進程式效能。

以 bitwise-AND 取代 modulo 運算不只是「精進」效能,更是確保關鍵操作的執行時間,modulo 和 div 在現代處理器雖然有很大的改進,但沒辦法保證一定在固定的 CPU cycle 內完成

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv

lfring_dequeue
/* Dequeue elements from head.
 *
 * @lfr: ring buffer
 * @elems: output array receiving up to n_elems element pointers
 * @n_elems: maximum number of elements to dequeue
 * @index: receives the ring index of the first dequeued element; it is not
 *         written when the function returns 0
 * @return number of elements actually dequeued (0 if the ring is empty)
 */
uint32_t lfring_dequeue(lfring_t *lfr,
                        void **restrict elems,
                        uint32_t n_elems,
                        uint32_t *index)
{
    ringidx_t mask = lfr->mask;
    intptr_t actual;
    ringidx_t head = __atomic_load_n(&lfr->head, __ATOMIC_RELAXED);
    ringidx_t tail = __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);
    do {
        /* Elements available, capped at what the caller asked for. The
         * difference is taken as a signed value, so a tail behind head
         * yields a non-positive count.
         */
        actual = MIN((intptr_t) (tail - head), (intptr_t) n_elems);
        if (UNLIKELY(actual <= 0)) {
            /* Ring buffer is empty, scan for new but unreleased elements */
            tail = find_tail(lfr, head, tail);
            actual = MIN((intptr_t) (tail - head), (intptr_t) n_elems);
            if (actual <= 0)
                return 0;
        }
        /* Copy the element pointers out before head is published. */
        for (uint32_t i = 0; i < (uint32_t) actual; i++)
            elems[i] = lfr->ring[(head + i) & mask].ptr;
        smp_fence(LoadStore);                        // Order loads only
        if (UNLIKELY(lfr->flags & LFRING_FLAG_SC)) { /* Single-consumer */
            /* No other consumer can move head, so a plain store suffices. */
            __atomic_store_n(&lfr->head, head + actual, __ATOMIC_RELAXED);
            break;
        }

        /* else: lock-free multi-consumer */
        /* The compare-exchange succeeds only if no other consumer moved head
         * in the meantime; otherwise head is refreshed and the loop retries.
         */
    } while (!__atomic_compare_exchange_n(
        &lfr->head, &head, /* Updated on failure */
        /* XXXXX HHH */ head + actual,
        /* weak */ false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
    *index = (uint32_t) head;
    return (uint32_t) actual;
}

lfring_dequeue 首先計算可取出的元素數量(取 tail - head 與 n_elems 之較小者)。若沒有任何元素可取,便呼叫 find_tail() 以取得最新的 tail :在單生產者模式下, find_tail() 僅重新讀取環的 tail ;在多生產者模式下,由於 tail 的值會被多個生產者更改,在前方所快取的 tail 變數不一定為最新的 tail ,因此 find_tail() 會掃描環中已寫入但尚未發佈的元素。若仍無元素可取便返回 0 。在此之後, actual 的值便會確定。 actual 為一紀錄實際可取元素數的變數。

接下來便來到內部的 for 迴圈,一一將位於環中 head 位置的元素取出並存於 elems 中。

最後,在單消費者模式下,由於只會有目前的執行緒更動環的 head ,因此可在更動環的 head 後直接進行收尾。在多消費者模式下,因為會有多個消費者更動環的 head ,因此需在主邏輯結束後以 compare-and-swap 檢查所快取的 head 是否仍與環的 head 相同,若是則更新環的 head 並收尾,否則更新快取的 head 後,再次從計算可取出的元素數量開始。

lfring_enqueue
/* Enqueue elements at tail.
 *
 * @lfr: ring buffer
 * @elems: array of element pointers to enqueue
 * @n_elems: number of entries in elems
 * @return number of elements actually enqueued (may be fewer than n_elems,
 *         including 0, when the ring lacks free slots)
 */
uint32_t lfring_enqueue(lfring_t *lfr,
                        void *const *restrict elems,
                        uint32_t n_elems)
{
    intptr_t actual = 0;
    ringidx_t mask = lfr->mask;
    ringidx_t size = mask + 1;
    ringidx_t tail = __atomic_load_n(&lfr->tail, __ATOMIC_RELAXED);

    if (lfr->flags & LFRING_FLAG_SP) { /* single-producer */
        ringidx_t head = __atomic_load_n(&lfr->head, __ATOMIC_ACQUIRE);
        /* Free slots (head + size - tail), capped at the request. */
        actual = MIN((intptr_t) (head + size - tail), (intptr_t) n_elems);
        if (actual <= 0)
            return 0;

        for (uint32_t i = 0; i < (uint32_t) actual; i++) {
            /* A free slot still carries the index from one lap back. */
            assert(lfr->ring[tail & mask].idx == tail - size);
            lfr->ring[tail & mask].ptr = *elems++;
            lfr->ring[tail & mask].idx = tail;
            tail++;
        }
        /* Publish all written slots at once with a release store. */
        __atomic_store_n(&lfr->tail, tail, __ATOMIC_RELEASE);
        return (uint32_t) actual;
    }

    /* else: lock-free multi-producer */
restart:
    /* Keep going while there is something left to enqueue and tail has not
     * run a full lap ahead of the current head.
     */
    while ((uint32_t) actual < n_elems &&
           before(tail, __atomic_load_n(&lfr->head, __ATOMIC_ACQUIRE) + size)) {
        /* The slot is viewed both as a struct element and as a ptrpair_t so
         * that pointer and index can be compared-and-swapped together.
         */
        union {
            struct element e;
            ptrpair_t pp;
        } old, neu;
        void *elem = elems[actual];
        struct element *slot = &lfr->ring[tail & mask];
        old.e.ptr = __atomic_load_n(&slot->ptr, __ATOMIC_RELAXED);
        old.e.idx = __atomic_load_n(&slot->idx, __ATOMIC_RELAXED);
        do {
            /* Expected case: the slot still holds last lap's index. */
            if (UNLIKELY(old.e.idx != tail - size)) {
                if (old.e.idx != tail) {
                    /* We are far behind. Restart with fresh index */
                    tail = cond_reload(tail, &lfr->tail);
                    goto restart;
                }
                /* slot already enqueued */
                tail++; /* Try next slot */
                goto restart;
            }

            /* Found slot that was used one lap back.
             * Try to enqueue next element.
             */
            neu.e.ptr = elem;
            neu.e.idx = tail; /* Set idx on enqueue */
        /* NOTE(review): presumably lf_compare_exchange() refreshes old.pp on
         * failure so the checks above re-run on current data - confirm.
         */
        } while (!lf_compare_exchange((ptrpair_t *) slot, &old.pp, neu.pp));

        /* Enqueue succeeded */
        actual++;
        tail++; /* Continue with next slot */
    }
    /* Offer the advanced tail to the shared tail; the result is not needed. */
    (void) cond_update(&lfr->tail, tail);
    return (uint32_t) actual;
}

lfring_enqueue 針對單生產者和多生產者有著不同的策略。若為單生產者,則確認是否有空間插入資料,若無則返回,若有則依據目前最大容量依序將欲插入之資料放入環中,更新環的 tail 後返回。

若為多生產者模式,亦先確認是否有空間插入資料,若無空間則返回。若有空間,由於多個生產者會更動環的 tail ,因此需檢查快取的 tail 所對應的 slot 是否為空。若所對應的 slot 不為空的話,則依據不為空的原因來更新快取的 tail :如是因為其他的生產者插入少許新資料導致,則更新快取的 tail 以查看下一個 slot 的狀態;如是因為其他的生產者插入多個新資料導致,則更新快取的 tail 以查看最新的 tail 所對應之 slot 的狀態。在此,「多個」係指其他生產者所插入的資料數大於等於環的空間。

在確保現在快取的 tail 對應之 slot 為空後,最後再經由 lf_compare_exchange() 確保沒有任何其他生產者對於該 slot 插入資料才實際更新將資料插入環中。


測驗 3

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/list.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/workqueue.h>

/* Module metadata. */
MODULE_AUTHOR("National Cheng Kung University, Taiwan");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("A kernel module that kills ptrace tracer and its tracees");

/* Delay, in jiffies, between two consecutive runs of periodic_routine(). */
#define JIFFIES_DELAY 1
/* Name of the workqueue created in dont_trace_init(). */
#define DONT_TRACE_WQ_NAME "dont_trace_worker"

/* Handler of the delayed work item; defined further down. */
static void periodic_routine(struct work_struct *);
/* Delayed work item that runs periodic_routine(). */
static DECLARE_DELAYED_WORK(dont_trace_task, periodic_routine);
/* Workqueue created at module init and destroyed at module exit. */
static struct workqueue_struct *wq;
/* True between module init and module exit; periodic_routine() only scans
 * for tracers while it is set.
 */
static bool loaded;

/* Send SIGKILL from kernel space to @task.
 * NOTE(review): the third send_sig() argument (1) presumably marks the
 * signal as originating from the kernel rather than from a user process -
 * confirm against the send_sig() definition.
 */
static void kill_task(struct task_struct *task)
{
    send_sig(SIGKILL, task, 1);
}

/* Tell whether a process is tracing anyone.
 *
 * @children: the task's `ptraced` list head, whose nodes are linked through
 *            each tracee's `ptrace_entry`
 * @return true if the process has tracees
 *
 * list_entry() is plain pointer arithmetic and never produces NULL for a
 * node that is on the list, so "has at least one tracee" is exactly
 * "the list is not empty".
 */
static bool is_tracer(struct list_head *children)
{
    return !list_empty(children);
}

/* Traverse the element in the linked list of the ptraced proccesses and
 * finally kills them.
 */
static void kill_tracee(struct list_head *children)
{
    struct list_head *list;
    list_for_each (list, children) {
        struct task_struct *task_ptraced =
            list_entry(list, struct task_struct, ptrace_entry);
        pr_info("ptracee -> comm: %s, pid: %d, gid: %d, ptrace: %d\n",
                task_ptraced->comm, task_ptraced->pid, task_ptraced->tgid,
                task_ptraced->ptrace);
        kill_task(task_ptraced);
    }
}

/* Scan every process; for each one that is tracing other processes, kill
 * its tracees and then the tracer itself.
 */
static void check(void)
{
    struct task_struct *task;

    /* for_each_process() walks the RCU-protected task list, so the walk
     * must sit inside an RCU read-side critical section. Nothing called
     * from the loop body sleeps.
     */
    rcu_read_lock();
    for_each_process (task) {
        if (!is_tracer(&task->ptraced))
            continue;

        kill_tracee(&task->ptraced);
        kill_task(task); /* Kill the tracer once all tracees are killed */
    }
    rcu_read_unlock();
}

static void periodic_routine(struct work_struct *ws)
{
    /* Check if the module is loaded. */
    if (likely(/* XXXXX: Implement */loaded))
        check();
    /* XXXXX: Implement */
    /* Put work task in global workqueue after JIFFIES_DELAY */
    schedule_delayed_work(&dont_trace_task, JIFFIES_DELAY);
}

/* Module entry point: create the workqueue and start the periodic scan.
 *
 * @return 0 on success, -ENOMEM if the workqueue cannot be created
 */
static int __init dont_trace_init(void)
{
    wq = create_workqueue(DONT_TRACE_WQ_NAME);
    if (!wq)
        return -ENOMEM;

    /* Set the flag before the first run can possibly start, so that run
     * already performs a scan.
     */
    loaded = true;
    queue_delayed_work(wq, &dont_trace_task, JIFFIES_DELAY);

    pr_info("Loaded!\n");
    return 0;
}

/* Module exit point: stop the periodic scan and release the workqueue. */
static void __exit dont_trace_exit(void)
{
    loaded = false;

    /* The work item re-arms itself, so a plain cancel followed by a flush
     * can still leave a freshly queued instance behind. The _sync variant
     * waits for a running instance and guarantees the item is neither
     * pending nor executing when it returns, whichever workqueue it was on.
     */
    cancel_delayed_work_sync(&dont_trace_task);

    destroy_workqueue(wq);

    pr_info("Unloaded.\n");
}

/* Register the module's load and unload handlers. */
module_init(dont_trace_init);
module_exit(dont_trace_exit);

程式碼運作原理

當 module 被載入 kernel 後,便會執行 dont_trace_init 函式。其功能為以 create_workqueue 建立專屬的 workqueue ( wq ),將已經初始化的 dont_trace_task 加入其中,並在 JIFFIES_DELAY 後執行所指定的函式,在此即 periodic_routine 。

periodic_routine 的功能為利用 check() 刪除所有 tracer 和 tracee ,並再次將 dont_trace_task 排入 workqueue 中,待 JIFFIES_DELAY 後再執行自己。

check 和其所呼叫的函式即為本模組的關鍵所在。其中 is_tracer 函式用於檢查 process 是否有追蹤其他的 process 。其技巧為檢查 task->ptraced 串列中是否存在任何 tracee (即該串列是否非空)。在確定該 process 為 tracer 後,則藉 kill_tracee(&task->ptraced) 和 kill_task(task) 立即中止其所有 tracee 和該 process 。

kill_task 發送中止訊號 SIGKILL 至所指定的 task 中。其呼叫的函式 send_sig 之第三個引數 priv 若非零,表示該訊號是由 kernel 所發出(對應 SEND_SIG_PRIV ),而非來自使用者行程。

探討行程避免被追蹤的手法

該文章中所提及的其中一個方法為設置 ptrace_scope ,將 ptrace_scope 設為最高級別的 3 可避免任何人的追蹤。