Try   HackMD

2022-04-04 cwl0429

測驗 1

這題目的目標是將 memchr 改寫成 SWAR 版本

/*
 * Word-at-a-time (SWAR) variant of memchr.
 *
 * Searches the first `length` bytes starting at `src_void` for the byte value
 * `c` (converted to unsigned char).
 *
 * Returns a pointer to the first matching byte, or NULL if none of the
 * `length` bytes matches.
 *
 * Relies on the externally provided macros UNALIGNED, TOO_SMALL, LBLOCKSIZE
 * and DETECT_CHAR. NOTE(review): LBLOCKSIZE is presumed to be
 * sizeof(unsigned long) and DETECT_CHAR(word, mask) is presumed to be nonzero
 * exactly when some byte of `word` equals the byte replicated in `mask` --
 * confirm against their definitions.
 */
void *memchr_opt(const void *src_void, int c, size_t length)
{
    const unsigned char *src = (const unsigned char *) src_void;
    unsigned char d = c;

    /* Bytewise scan until src passes the word-alignment test. */
    while (UNALIGNED(src)) {
        if (!length--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }

    if (!TOO_SMALL(length)) {
        /* If we get this far, we know that length is large and
         * src is word-aligned.
         */

        /* The fast code reads the source one word at a time and only performs
         * the bytewise search on word-sized segments if they contain the search
         * character, which is detected by XORing the word-sized segment with a
         * word-sized block of the search character and then detecting for the
         * presence of NULL in the result.
         */
        const unsigned long *asrc = (const unsigned long *) src;

        /* Replicate d into every byte of the word: 8 -> 16 -> 32 (-> 64) bits.
         * The shift count is always smaller than the word width.
         */
        unsigned long mask = d;
        for (unsigned int i = 8; i < LBLOCKSIZE * 8; i <<= 1)
            mask |= mask << i;

        while (length >= LBLOCKSIZE) {
            /* Stop at the first word that contains the target byte and let the
             * bytewise loop below pinpoint it. Deriving the byte offset from
             * the bit pattern (clz/ctz) is endian-dependent, and the zero-byte
             * detection trick may also flag bytes above the real match, so the
             * bytewise rescan of a single word is the portable choice.
             */
            if (DETECT_CHAR(*asrc, mask))
                break;
            length -= LBLOCKSIZE;
            asrc += 1;
        }

        /* Either a word containing the character was found, or fewer than
         * LBLOCKSIZE characters are left; in both cases resort to the
         * bytewise loop.
         */
        src = (const unsigned char *) asrc;
    }

    while (length--) {
        if (*src == d)
            return (void *) src;
        src++;
    }

    return NULL;
}

主要新增此段程式碼,想法是利用老師提供的 DETECT_CHAR 巨集找出此 64 bits 或 32 bits 內是否存在目標 character

  • 若是有則將 src 指向目標 character 所在的地址並回傳
  • 否則將 length 更新並將 asrc 指到下一個 unsigned long 所在地址
while (length >= LBLOCKSIZE) {
            /* Stop at the first word that contains the target byte; the
             * bytewise loop that follows in memchr_opt then locates the exact
             * byte. Computing the offset with (__builtin_clzl(cmp) >> 3) - 1
             * is wrong: it inspects the highest set bit, which is not the
             * first byte in memory on little-endian machines and may be a
             * false positive of the zero-byte detection trick.
             */
            if (DETECT_CHAR(*asrc, mask))
                break;
            length -= LBLOCKSIZE;
            asrc += 1;
        }

測驗 2

此 function 的用途是 reload idx

  • 若是 fresh 領先 idx 則將 idx 替換成 fresh
  • 否則就使用接續的下一個 idx
/*
 * Pick the index to continue with after a failed attempt.
 *
 * Loads the current value stored at `loc` (relaxed ordering). If that value
 * is ahead of `idx` according to before(), it is returned; otherwise the slot
 * following `idx` is returned.
 */
static inline ringidx_t cond_reload(ringidx_t idx, const ringidx_t *loc)
{
    const ringidx_t latest = __atomic_load_n(loc, __ATOMIC_RELAXED);

    /* The stored index is already past ours: jump straight to it. */
    if (before(idx, latest))
        return latest;

    /* Otherwise continue with the next slot. */
    return idx + 1;
}

這邊需要填入 KKK 與 TTT

  • KKK 需要填入正確的 size
  • TTT 是 __atomic_load_n 的參數,用來載入 tail 並和先前載入的 tail 做比對
/*
 * Determine how far the consumer may read.
 *
 * lfr:  the ring buffer.
 * head: the consumer's current head index.
 * tail: the tail index the caller last observed.
 *
 * Single-producer rings: returns a fresh acquire-load of lfr->tail.
 * Multi-producer rings: walks forward from `tail` over slots that have been
 * written but whose tail update has not been released yet, publishes the
 * result through cond_update() and returns the value it yields.
 */
static inline ringidx_t find_tail(lfring_t *lfr, ringidx_t head, ringidx_t tail)
{
    if (lfr->flags & LFRING_FLAG_SP) /* single-producer enqueue */
        return __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);

    /* Multi-producer enqueue.
     * Scan ring for new elements that have been written but not released.
     */
    ringidx_t mask = lfr->mask;
    /* mask is used to wrap indices into the ring, so the capacity is mask + 1;
     * the scan must never run more than one full ring ahead of head.
     */
    ringidx_t size = mask + 1;
    /* A slot counts as written when the index stored in it equals the index we
     * are probing. Comparing against lfr->tail instead would stop after at
     * most one step and could report a slot that was never filled.
     * NOTE(review): assumes every ring element carries an `idx` field set by
     * the producer, as in the upstream lfring design -- confirm against the
     * element struct. Acquire ordering is used so the caller's later read of
     * the slot payload is ordered after this observation.
     */
    while (before(tail, head + size) &&
           __atomic_load_n(&lfr->ring[tail & mask].idx, __ATOMIC_ACQUIRE) ==
               tail)
        tail++;
    tail = cond_update(&lfr->tail, tail);
    return tail;
}

此處要填入 HHH

  • HHH 是 __atomic_compare_exchange_n 的 desired 部份,可以看出此處需要更新 head 的數值
/*
 * Dequeue up to `n_elems` elements from the ring.
 *
 * lfr:     the ring buffer.
 * elems:   output array receiving the dequeued pointers.
 * n_elems: maximum number of elements to dequeue.
 * index:   receives the head index (truncated to 32 bits) at which the
 *          dequeued run starts; only written when something was dequeued.
 *
 * Returns the number of elements stored in `elems`, or 0 when the ring holds
 * nothing to dequeue.
 */
uint32_t lfring_dequeue(lfring_t *lfr,
                        void **restrict elems,
                        uint32_t n_elems,
                        uint32_t *index)
{
    ringidx_t mask = lfr->mask;
    intptr_t actual;
    ringidx_t head = __atomic_load_n(&lfr->head, __ATOMIC_RELAXED);
    ringidx_t tail = __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);
    do {
        actual = MIN((intptr_t)(tail - head), (intptr_t) n_elems);
        if (UNLIKELY(actual <= 0)) {
            /* Ring buffer is empty, scan for new but unreleased elements */
            tail = find_tail(lfr, head, tail);
            actual = MIN((intptr_t)(tail - head), (intptr_t) n_elems);
            if (actual <= 0)
                return 0;
        }
        /* Speculatively copy the elements; they only count as dequeued once
         * head has been advanced below.
         */
        for (uint32_t i = 0; i < (uint32_t) actual; i++)
            elems[i] = lfr->ring[(head + i) & mask].ptr;
        smp_fence(LoadStore);                        // Order loads only
        if (UNLIKELY(lfr->flags & LFRING_FLAG_SC)) { /* Single-consumer */
            __atomic_store_n(&lfr->head, head + actual, __ATOMIC_RELAXED);
            break;
        }

        /* else: lock-free multi-consumer */
        /* The CAS must advance head by exactly the number of elements copied,
         * starting from the `head` snapshot it is compared against. Re-reading
         * lfr->head non-atomically, or advancing by 1, would hand out the same
         * slots twice or skip elements.
         */
    } while (!__atomic_compare_exchange_n(
        &lfr->head, &head, /* Updated on failure */
        head + actual,
        /* weak */ false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
    *index = (uint32_t) head;
    return (uint32_t) actual;
}

測驗 3

研讀中..