2022-04-04
、 StevenChou499
1
利用上述 SIMD within a register (SWAR) 的技巧,我們可改寫為以下
memchr_opt
函式:
#include <stddef.h> #include <stdint.h> #include <limits.h> #include <string.h> /* Nonzero if either X or Y is not aligned on a "long" boundary */ #define UNALIGNED(X) ((long) X & (sizeof(long) - 1)) /* How many bytes are loaded each iteration of the word copy loop */ #define LBLOCKSIZE (sizeof(long)) /* Threshhold for punting to the bytewise iterator */ #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE) #if LONG_MAX == 2147483647L #define DETECT_NULL(X) (((X) -0x01010101) & ~(X) & 0x80808080) #else #if LONG_MAX == 9223372036854775807L /* Nonzero if X (a long int) contains a NULL byte. */ #define DETECT_NULL(X) (((X) -0x0101010101010101) & ~(X) & 0x8080808080808080) #else #error long int is not a 32bit or 64bit type. #endif #endif /* @return nonzero if (long)X contains the byte used to fill MASK. */ #define DETECT_CHAR(X, MASK) (DETECT_NULL(X ^ MASK)) void *memchr_opt(const void *src_void, int c, size_t length) { const unsigned char *src = (const unsigned char *) src_void; unsigned char d = c; while (UNALIGNED(src)) { if (!length--) return NULL; if (*src == d) return (void *) src; src++; } if (!TOO_SMALL(length)) { /* If we get this far, we know that length is large and * src is word-aligned. */ /* The fast code reads the source one word at a time and only performs * the bytewise search on word-sized segments if they contain the search * character, which is detected by XORing the word-sized segment with a * word-sized block of the search character and then detecting for the * presence of NULL in the result. */ unsigned long *asrc = (unsigned long *) src; unsigned long mask = d << 8 | d; mask = mask << 16 | mask; for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1) mask = (mask << i) | mask; while (length >= LBLOCKSIZE) { /* XXXXX: Your implementation should appear here */ } /* If there are fewer than LBLOCKSIZE characters left, then we resort to * the bytewise loop. */ src = (unsigned char *) asrc; } while (length--) { if (*src == d) return (void *) src; src++; } return NULL; }請補完程式碼,使上述 memchr_opt 的實作符合 memchr 行為,作答規範:
- 列出
memchr_opt
函式完整程式碼,儘量撰寫程式註解XXXX
所在的 scope 應該利用DETECT_CHAR
巨集- 儘量以最精簡的程式碼撰寫
為了知道 XXXXX
需要填什麼,我們需要先了解程式碼的內容與用意。
#define UNALIGNED(X) ((long) X & (sizeof(long) - 1))
UNALIGNED()
的作用是確認該字串的地址是否有以 long
的長度來做對齊,若有則會回傳 0 ,反之則會回傳其他整數。
#define LBLOCKSIZE (sizeof(long))
根據不同的 Data Model , long 所需的字節數也不同,因此需要以巨集來定義。
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
比較其大小是否超過 long 的長度。
#if LONG_MAX == 2147483647L
#define DETECT_NULL(X) (((X) -0x01010101) & ~(X) & 0x80808080)
#else
#if LONG_MAX == 9223372036854775807L
/* Nonzero if X (a long int) contains a NULL byte. */
#define DETECT_NULL(X) (((X) -0x0101010101010101) & ~(X) & 0x8080808080808080)
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif
此處的用意為區分 32-bit 與 64-bit 在數字上表現的極限差異,並且其中需要的 DETECT_NULL()
也會因此需要調整。
#define DETECT_CHAR(X, MASK) (DETECT_NULL(X ^ MASK))
透過 DETECT_NULL()
去檢查是否擁有該字節,若沒有則會回傳 0 ,反之則不會。
const unsigned char *src = (const unsigned char *) src_void;
unsigned char d = c;
while (UNALIGNED(src)) {
if (!length--)
return NULL;
if (*src == d)
return (void *) src;
src++;
}
此處為判斷 src
是否有對齊成功,若有則不會進入,若無則會使用例題之方法檢查並回傳地址或是回傳 NULL
。
if (!TOO_SMALL(length)) {
/* If we get this far, we know that length is large and
* src is word-aligned.
*/
/* The fast code reads the source one word at a time and only performs
* the bytewise search on word-sized segments if they contain the search
* character, which is detected by XORing the word-sized segment with a
* word-sized block of the search character and then detecting for the
* presence of NULL in the result.
*/
unsigned long *asrc = (unsigned long *) src;
unsigned long mask = d << 8 | d;
mask = mask << 16 | mask;
for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
mask = (mask << i) | mask;
while (length >= LBLOCKSIZE) {
/* XXXXX: Your implementation should appear here */
}
/* If there are fewer than LBLOCKSIZE characters left, then we resort to
* the bytewise loop.
*/
src = (unsigned char *) asrc;
}
此處是先確認字傳長度夠長,接著定義一個指向 unsigned long
的 asrc
,並且使用想要找到的 d
製作一個 mask
,後面的 for
迴圈是因為會有不同的 Data Model 導致 long
所佔的 bytes 不一樣。
XXXXX
填答:在 while
迴圈中就是正式尋找字元 c
的範圍了。因為是需要依照 LBLOCKSIZE
的長度去做尋找,所以我們需要透過 DETECT_CHAR()
,再加上 mask
與 asrc
來尋找,但是由於 asrc
為一指標,所以需要加上 *
。若 DETECT_CHAR(mask, *asrc)
回傳非 0 值,代表在這 8 個 byte 中有找到所需字元 c
,這時必須停止尋找,並跳出迴圈,因此要加上 break
。反之若回傳 0 則代表還沒有找到,則需要跳往下 8 個 byte 繼續尋找,此時必須將 asrc++
便會一次跳躍 8 個 byte ,並且 length
要減去 LBLOCKSIZE
,這樣才不會已經檢查完了但確還沒有跳出迴圈。
#include <stddef.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
/* Nonzero if either X or Y is not aligned on a "long" boundary */
#define UNALIGNED(X) ((long) X & (sizeof(long) - 1))
/* How many bytes are loaded each iteration of the word copy loop */
#define LBLOCKSIZE (sizeof(long))
/* Threshhold for punting to the bytewise iterator */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
#if LONG_MAX == 2147483647L // computer is 32-bit
#define DETECT_NULL(X) (((X) -0x01010101) & ~(X) & 0x80808080)
#else
#if LONG_MAX == 9223372036854775807L // computer is 64-bit
/* Nonzero if X (a long int) contains a NULL byte. */
#define DETECT_NULL(X) (((X) -0x0101010101010101) & ~(X) & 0x8080808080808080)
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif
/* @return nonzero if (long)X contains the byte used to fill MASK. */
#define DETECT_CHAR(X, MASK) (DETECT_NULL(X ^ MASK))
void *memchr_opt(const void *src_void, int c, size_t length)
{
const unsigned char *src = (const unsigned char *) src_void;
unsigned char d = c;
while (UNALIGNED(src)) { // use normal way to find character
if (!length--)
return NULL;
if (*src == d)
return (void *) src;
src++;
}
if (!TOO_SMALL(length)) { // the length si long enough
/* If we get this far, we know that length is large and
* src is word-aligned.
*/
/* The fast code reads the source one word at a time and only performs
* the bytewise search on word-sized segments if they contain the search
* character, which is detected by XORing the word-sized segment with a
* word-sized block of the search character and then detecting for the
* presence of NULL in the result.
*/
unsigned long *asrc = (unsigned long *) src;
unsigned long mask = d << 8 | d; // create a mask full of character d
mask = mask << 16 | mask;
for (unsigned int i = 32; i < LBLOCKSIZE * 8; i <<= 1)
mask = (mask << i) | mask; // the mask is whether 4 or 8 bytes long
while (length >= LBLOCKSIZE) {
if(DETECT_CHAR(mask, *asrc)){ // return non-zero if detect d in *asrc
break; // if detected, exit the while loop
}
asrc++; // if not detected, jump to the next 8 bytes
length -= LBLOCKSIZE; // subtract length by LBLOCKSIZE
}
/* If there are fewer than LBLOCKSIZE characters left, then we resort to
* the bytewise loop.
*/
src = (unsigned char *) asrc;
}
while (length--) { // continue searching for character d
if (*src == d)
return (void *) src; // if found, return the address
src++; // if not found, jump to the next character
}
return NULL; // if all not found, return NULL
}
解答:
while (length >= LBLOCKSIZE) {
/* XXXXX: Your implementation should appear here */
if(DETECT_CHAR(mask, *asrc)){
break;
}
asrc++;
length -= LBLOCKSIZE;
}
2
contributed by < StevenChou499 > 進行 第 12 週測驗題第二題的延伸問題。過程中參照第 13 週測驗題第一/二題,嘗試開發出高效的 ring buffer。 相關資訊:[ ] ringbuffer 報告 [ ] Super Fast Circular Ring Buffer Using Virtual Memory trick [ ] Using black magic to make a fast circular buffer [ ] linear_ringbuffer/include/bev/linear_ringbuffer.hpp [ ] wuffs/script/mmap-ring-buffer.c
Feb 2, 2024ModelSim 在終端機使用方法 vlib <name> : 建立名為 name 的設計函式庫。 vlog <files> : 使用 ModelSim 編譯所有 files 的檔案。 vsim <top_level_module> : 使用 ModelSim 開啟編譯完成的檔案。 在 ModelSim 中輸入 run -all : 使用 vsim 開啟檔案後,輸入 run -all 可以執行模擬。 以下為範例 : PS C:\Users\StevenChou\Desktop\Verilog_practice\Chp5\Q3> vlib work ** Warning: (vlib-34) Library already exists at "work".
May 2, 2023第一個成功版本 程式碼 : `timescale 1ns/10ps module ATCONV( input clk, input reset, output reg busy, input ready,
May 1, 2023HDLBits 連結 : https://hdlbits.01xz.net/wiki/Main_Page
Apr 19, 2023or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up