2022-04-04
cwl04291
這題目的目標是將 memchr 改寫成 SWAR 版本
/**
 * memchr_opt() - Locate the first occurrence of a byte in a memory area.
 * @src_void: start of the area to search
 * @c:        value to look for; converted to unsigned char before comparing
 * @length:   number of bytes to examine
 *
 * Bytes are compared one at a time while UNALIGNED(src) holds.  After that the
 * area is read one unsigned long at a time and DETECT_CHAR() reports whether
 * the current word contains the byte.  Whatever is left is scanned bytewise.
 *
 * Return: pointer to the first matching byte, or NULL when none of the first
 * @length bytes matches.
 */
void *memchr_opt(const void *src_void, int c, size_t length)
{
    const unsigned char *src = (const unsigned char *) src_void;
    unsigned char d = c;

    /* Bytewise search until src reaches a word boundary. */
    while (UNALIGNED(src)) {
        if (!length--)
            return NULL;
        if (*src == d)
            return (void *) src;
        src++;
    }

    if (!TOO_SMALL(length)) {
        /* If we get this far, we know that length is large and
         * src is word-aligned.
         */
        /* The fast code reads the source one word at a time and only looks
         * inside a word if it contains the search character, which is
         * detected by XORing the word with a word-sized block of the search
         * character and then detecting the presence of NULL in the result.
         */
        const unsigned long *asrc = (const unsigned long *) src;

        /* ~0UL / 0xFF is 0x0101...01, so this copies d into every byte. */
        unsigned long mask = (~0UL / 0xFFUL) * d;

        while (length >= LBLOCKSIZE) {
            /* XXXXX: Your implementation should appear here */
            unsigned long cmp = DETECT_CHAR(*asrc, mask);
            if (cmp) {
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
                /* NOTE(review): assumes DETECT_CHAR() sets the top bit of each
                 * matching byte (newlib-style DETECT_NULL) -- confirm against
                 * the macro.  On little-endian the first byte in memory is the
                 * least significant one, so the lowest set bit marks the first
                 * match; count trailing zeros and convert bits to bytes.
                 */
                src = (const unsigned char *) asrc;
                return (void *) (src + (__builtin_ctzl(cmp) >> 3));
#else
                /* Byte order unknown or big-endian: let the bytewise loop
                 * below find the exact position inside this word.
                 */
                break;
#endif
            }
            length -= LBLOCKSIZE;
            asrc++;
        }

        /* If there are fewer than LBLOCKSIZE characters left, then we resort
         * to the bytewise loop.
         */
        src = (const unsigned char *) asrc;
    }

    while (length--) {
        if (*src == d)
            return (void *) src;
        src++;
    }

    return NULL;
}
主要新增此段程式碼,想法是利用老師提供的 DETECT_CHAR 巨集找出此 64 bits 或 32 bits 內是否存在目標 character。
若存在,則將 src 指向目標 character 所在的地址並回傳;否則將 length 更新,並將 asrc
指到下一個 unsigned long 所在地址while (length >= LBLOCKSIZE) {
/* XXXXX: Your implementation should appear here */
unsigned long cmp = DETECT_CHAR(*asrc, mask);
if (cmp) {
src = (unsigned char *) asrc;
src += (__builtin_clzl(cmp) >> 3) - 1; // convert bits into bytes
return (void *) src;
}
length -= LBLOCKSIZE;
asrc += 1;
}
2
此 function 的用途是 reload idx:若 fresh 領先 idx,則將 idx 替換成 fresh;否則將 idx 加一(idx++)。
/**
 * cond_reload() - Pick the next index to try.
 * @idx: index the caller currently holds
 * @loc: location to re-read with a relaxed atomic load
 *
 * Return: the value read from @loc when before(@idx, value) holds, otherwise
 * @idx advanced by one slot.
 */
static inline ringidx_t cond_reload(ringidx_t idx, const ringidx_t *loc)
{
    const ringidx_t latest = __atomic_load_n(loc, __ATOMIC_RELAXED);

    /* latest is after idx: jump to it; otherwise continue with next slot */
    return before(idx, latest) ? latest : (ringidx_t) (idx + 1);
}
這邊需要填入 KKK 及 TTT。
KKK 需要填入正確的 size;TTT 是 __atomic_load_n 的參數,用來載入 tail,並和先前載入的 tail
做比對static inline ringidx_t find_tail(lfring_t *lfr, ringidx_t head, ringidx_t tail)
{
if (lfr->flags & LFRING_FLAG_SP) /* single-producer enqueue */
return __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);
/* Multi-producer enqueue.
* Scan ring for new elements that have been written but not released.
*/
ringidx_t mask = lfr->mask;
ringidx_t size = /* XXXXX KKK*/ mask - 1;
while (before(tail, head + size) &&
__atomic_load_n(/* XXXXX TTT*/ &lfr->tail, __ATOMIC_RELAXED) ==
tail)
tail++;
tail = cond_update(&lfr->tail, tail);
return tail;
}
此處要填入 HHH。
HHH 是 __atomic_compare_exchange_n 的 desired 部份,可以看出此處需要更新 head
的數值uint32_t lfring_dequeue(lfring_t *lfr,
void **restrict elems,
uint32_t n_elems,
uint32_t *index)
{
ringidx_t mask = lfr->mask;
intptr_t actual;
ringidx_t head = __atomic_load_n(&lfr->head, __ATOMIC_RELAXED);
ringidx_t tail = __atomic_load_n(&lfr->tail, __ATOMIC_ACQUIRE);
do {
actual = MIN((intptr_t)(tail - head), (intptr_t) n_elems);
if (UNLIKELY(actual <= 0)) {
/* Ring buffer is empty, scan for new but unreleased elements */
tail = find_tail(lfr, head, tail);
actual = MIN((intptr_t)(tail - head), (intptr_t) n_elems);
if (actual <= 0)
return 0;
}
for (uint32_t i = 0; i < (uint32_t) actual; i++)
elems[i] = lfr->ring[(head + i) & mask].ptr;
smp_fence(LoadStore); // Order loads only
if (UNLIKELY(lfr->flags & LFRING_FLAG_SC)) { /* Single-consumer */
__atomic_store_n(&lfr->head, head + actual, __ATOMIC_RELAXED);
break;
}
/* else: lock-free multi-consumer */
} while (!__atomic_compare_exchange_n(
&lfr->head, &head, /* Updated on failure */
/* XXXXX HHH*/ lfr->head + 1,
/* weak */ false, __ATOMIC_RELAXED, __ATOMIC_RELAXED)); //__atomic_compare_exchange_n (ptr, expected, desired, ...)
*index = (uint32_t) head;
return (uint32_t) actual;
}
3
研讀中..
資料內容 原先的資料 資料筆數4925571 筆 資料欄位 租借時長 租車日期 租車時間 第幾個小時租車的
Jun 10, 2023contributed by < cwl0429 > $ lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 39 bits physical, 48 bits virtual CPU(s): 8 On-line CPU(s) list: 0-7 Thread(s) per core: 2
Oct 31, 2022contributed by < cwl0429 > 實驗環境 $ gcc --version gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0 $ lscpu Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian
Oct 16, 2022安裝前置作業 檢查以下事項 gpu 型號 欲使用的 tensorflow 版本 確認 GPU 支援的 CUDA 版本 以 Geforce RTX 3060 為例,其 CUDA 至少要 8.6 以上
Aug 28, 2022or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up