fns
函式函式定義:
find N'th set bit in a word ,即在一 word 長度中的記憶體尋找第 n 個 set bits
函式作法:
__ffs
尋找低位開始第一個 set bit 之位置__clear_bit
清除該位 (設為 0) ,再回到 1.原程式碼:
static inline void __clear_bit(unsigned long nr, volatile unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
*p &= ~mask;
}
/* find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
while (word) {
unsigned int bit = __ffs(word);
if (n-- == 0)
return bit;
__clear_bit(bit, &word);
}
return BITS_PER_LONG;
}
改成:
需要注意的是,我加入了原先程式碼沒有的 macro (原本只到 __const_hweight8 ),只是為了美觀,如下:
#define __const_hweight2(w) \
((unsigned int) (!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))))
#define __const_hweight4(w) \
((unsigned int) (!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))) + \
(!!((w) & (1ULL << 2))) + (!!((w) & (1ULL << 3))))
/* find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int sum = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
bits = __const_hweight32(word);
if (bits <= n) {
n -= bits;
word >>= 32;
sum += 32;
}
#endif
bits = __const_hweight16(word);
if (bits <= n) {
n -= bits;
word >>= 16;
sum += 16;
}
bits = __const_hweight8(word);
if (bits <= n) {
n -= bits;
word >>= 8;
sum += 8;
}
bits = __const_hweight4(word);
if (bits <= n) {
n -= bits;
word >>= 4;
sum += 4;
}
bits = __const_hweight2(word);
if (bits <= n) {
n -= bits;
word >>= 2;
sum += 2;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
word >>= 1;
sum += 1;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
sum += 1;
}
if(!n)
return sum;
return BITS_PER_LONG;
}
當 n>0
時,以下 block 為尋找 n
個 set bit
,不過事實上需找 n+1
個 set bit
,因此當 n=0
時,代表他還需要找一個,此時以下 block 則變為尋找全為 0 的 bits ,因此最後會抵達第 n+1
個 set bit
的右一位,但因為 sum
為從 0 開始往上加,因此會剛好等於 n-th set bit
的位置。
bits = __const_hweightC(word & const);
if (bits <= n) {
n -= bits;
sum += const;
}
假設需找 7-th set bit
word =
0x1010010101011011
8 7 6 5 4 32 10
以上為 set bit 的對應 index ,可以看到 7-th set bit 對應位置為 13 (0 起始)
而 __const_hweightC 會回傳範圍 C 有多少個 set bits
此時 n > 0
====block 為尋找 n 個 set bit=====
C = 16 回傳 9 > n => 不進入判斷式
C = 8 回傳 5 <= n => n-=5 sum+=8
word >>= 8
0x10100101
3 2 1 0
C = 4 回傳 2 <= n => n-=2 sum+=4
word >>= 4
0x1010
1 0
此時 n = 0
====block 為尋找全為 0 的 bits=====
C = 2 回傳 1 > n => 不進入判斷式
C = 1 回傳 0 <= n => n-=0 sum+=1
word >>= 1
0x101
1 0
C = 1 回傳 1 > n => 不進入判斷式
0x101
1 0
結束
n=0 sum=13
可以看到此時 word
的 lsb
為 n-th set bit
,且右移了 13 位,即為 n-th set bit
的 index
比較:
原式: n+1
次 ffs
, n
次 clear bit
,而 ffs
使用 log(BITS_PER_LONG)
次判斷式,而 clear bit
為常數操作,因此最後會有 (n+1)
*log(BITS_PER_LONG)
+ (n + 1)(外部 while 迴圈)
次判斷式。
我的版本: log(BITS_PER_LONG) + 1 (再做一次 & 0x1 去完整檢查 64 bit 的範圍)
+ 1 次判斷式
因此 n 越大,差距應會越明顯
以下為測試程式碼:
可在這裡直接執行 OnlineGDB
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Assume 64-bit architecture */
#define BITS_PER_LONG 64
#define ARRAY_SIZE 1000000
#define BITS_PER_BYTE 8
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define DIV_ROUND_UP(n, d) (((n) + (d) -1) / (d))
#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
#define __const_hweight8(w) \
((unsigned int) ((!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))) + \
(!!((w) & (1ULL << 2))) + (!!((w) & (1ULL << 3))) + \
(!!((w) & (1ULL << 4))) + (!!((w) & (1ULL << 5))) + \
(!!((w) & (1ULL << 6))) + (!!((w) & (1ULL << 7)))))
#define __const_hweight16(w) (__const_hweight8(w) + __const_hweight8((w) >> 8))
#define __const_hweight32(w) \
(__const_hweight16(w) + __const_hweight16((w) >> 16))
#define __const_hweight64(w) \
(__const_hweight32(w) + __const_hweight32((w) >> 32))
static inline unsigned long hweight_long(unsigned long w)
{
return __const_hweight64(w);
}
/* find first bit in word.
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static inline unsigned long __ffs(unsigned long word)
{
int num = 0;
#if BITS_PER_LONG == 64
if (!(word & 0xffffffff)) {
num += 32;
word >>= 32;
}
#endif
if (!(word & 0xffff)) {
num += 16;
word >>= 16;
}
if (!(word & 0xff)) {
num += 8;
word >>= 8;
}
if (!(word & 0xf)) {
num += 4;
word >>= 4;
}
if (!(word & 0x3)) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
}
static inline void __clear_bit(unsigned long nr, volatile unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
*p &= ~mask;
}
/* find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
while (word) {
unsigned int bit = __ffs(word);
if (n-- == 0)
return bit;
__clear_bit(bit, &word);
}
return BITS_PER_LONG;
}
#define __const_hweight2(w) \
((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))))
#define __const_hweight4(w) \
((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))) + \
(!!((word) & (1ULL << 2))) + (!!((word) & (1ULL << 3))))
/* find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
*/
static inline unsigned long fns2(unsigned long word, unsigned int n)
{
unsigned long sum = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
bits = __const_hweight32(word & 0xffffffff);
if (bits <= n) {
n -= bits;
word >>= 32;
sum += 32;
}
#endif
bits = __const_hweight16(word & 0xffff);
if (bits <= n) {
n -= bits;
word >>= 16;
sum += 16;
}
bits = __const_hweight8(word & 0xff);
if (bits <= n) {
n -= bits;
word >>= 8;
sum += 8;
}
bits = __const_hweight4(word & 0xf);
if (bits <= n) {
n -= bits;
word >>= 4;
sum += 4;
}
bits = __const_hweight2(word & 0x3);
if (bits <= n) {
n -= bits;
word >>= 2;
sum += 2;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
word >>= 1;
sum += 1;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
sum += 1;
}
if(!n)
return sum;
return BITS_PER_LONG;
}
struct cycletime
{
unsigned long cycle;
double time;
};
int main()
{
unsigned long limit = 2000000000;
struct cycletime avgrec1 = {.cycle = 0, .time = 0.0};
struct cycletime avgrec2 = {.cycle = 0, .time = 0.0};
unsigned long test_t = 200;
for(unsigned long idx = 0; idx < 64; idx++){
for(unsigned long a = 0; a < test_t; a++){
// Allocate memory for the array
unsigned long *bit_array = malloc(BITS_TO_LONGS(ARRAY_SIZE) * sizeof(unsigned long));
if (!bit_array) {
perror("Memory allocation failed");
return EXIT_FAILURE;
}
// Initialize random seed
srand(a);
unsigned long t = 0;
// Set random bits
for (unsigned long i = 0; i < ARRAY_SIZE; ++i) {
unsigned long word_idx = BIT_WORD(i);
unsigned long bit_idx = i & (BITS_PER_LONG - 1);
if (rand() % 2) {
bit_array[word_idx] |= (1UL << bit_idx);
}
else
bit_array[word_idx] ^= (bit_array[word_idx] & (1UL << bit_idx));
}
unsigned long bound = BIT_WORD(ARRAY_SIZE);
unsigned long r1[bound];
unsigned long r2[bound];
clock_t start, end;
unsigned long long cycles_start, cycles_end;
start = clock();
asm volatile("rdtsc" : "=A" (cycles_start));
for (unsigned long i = 0; i < bound; ++i) {
r1[i] = fns(bit_array[i], idx);
}
asm volatile("rdtsc" : "=A" (cycles_end));
end = clock();
avgrec1.time += (double)(end - start) / CLOCKS_PER_SEC;
if((cycles_end - cycles_start) < limit)
avgrec1.cycle += (cycles_end - cycles_start);
// printf("original ver.=========================================================================================\n");
// printf("time : %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
// printf("CPU cycles : %llu cycles\n", cycles_end - cycles_start);
start = clock();
asm volatile("rdtsc" : "=A" (cycles_start));
for (unsigned long i = 0; i < bound; ++i) {
r2[i] = fns2(bit_array[i], idx);
}
asm volatile("rdtsc" : "=A" (cycles_end));
end = clock();
avgrec2.time += (double)(end - start) / CLOCKS_PER_SEC;
if((cycles_end - cycles_start) < limit)
avgrec2.cycle += (cycles_end - cycles_start);
// printf("new ver.==============================================================================================\n");
// printf("time : %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
// printf("CPU cycles : %llu cycles\n", cycles_end - cycles_start);
for (unsigned long i = 0; i < bound; ++i) {
if(r2[i] != r1[i]){
printf("Different in %ld, %lu vs %lu\n", i, r1[i], r2[i]);
}
}
free(bit_array);
}
// printf("original ver.=========================================================================================\n");
// printf("Average time : %f seconds\n", avgrec1.time/(test_t*idx));
// printf("Average CPU cycles : %lu cycles\n", avgrec1.cycle/(test_t*idx));
// printf("new ver.==============================================================================================\n");
// printf("Average time : %f seconds\n", avgrec2.time/(test_t*idx));
// printf("Average CPU cycles : %lu cycles\n", avgrec2.cycle/(test_t*idx));
}
printf("original ver.=========================================================================================\n");
printf("Average time : %f seconds\n", avgrec1.time/(test_t*64));
printf("Average CPU cycles : %lu cycles\n", avgrec1.cycle/(test_t*64));
printf("new ver.==============================================================================================\n");
printf("Average time : %f seconds\n", avgrec2.time/(test_t*64));
printf("Average CPU cycles : %lu cycles\n", avgrec2.cycle/(test_t*64));
return EXIT_SUCCESS;
}
其中:
test_t
為測試輪數,即尋找 n-th set bit
多少次
idx
為 n-th set bit
ARRAY_SIZE
為一連續記憶體空間,因此每一輪中皆會送入 1000000/64
個 long
長度的 word
去尋找 n-th set bit
測試過程:
1000000
的記憶體空間並隨機填入 1
或 0
long
(這裡為64
bits) 長度的記憶體位置傳入 fns
作計算,同時為了公平,兩比較函式的每一輪 (1000000/64
) 輸入都相同,並為了更好找到平均值,每一次尋找的 set bits 皆尋找 200 次,而 set bits 為能更全面,範圍為 0~63 。1000000/64
)*64
*200
次皆為尋找 長度 1000000 的記憶體位置 每一個 word 內的 n-th set bit
測量 32 bit
original ver.============================
Average time : 0.005153 seconds
Average CPU cycles : 18478354 cycles
new ver.=================================
Average time : 0.000790 seconds
Average CPU cycles : 2843377 cycles
測量 64 bit
original ver.============================
Average time : 0.007763 seconds
Average CPU cycles : 27781828 cycles
new ver.=================================
Average time : 0.000729 seconds
Average CPU cycles : 2620535 cycles
最後輸出正確
可以看到我改寫的版本在平均花費時間 (6.52x faster、10.64x faster) 與 cycle 數 (6.5x less、10.60x less) 皆比 linux/include/linux/bitops.h 裡的 fns
還要更佳
且也證明了 n
越大,差距越大,這裡因為是做 0 ~ 31
及 0 ~ 63
,所以會被平均,不過仍可以看到 n
增大會使平均時間增加。
而 CPU cycles 會降低我認為是 branch miss 次數降低,隨著 n
加大,代表需要找的 set bits 更多,在我的版本中會更容易進入判斷式 (只要 bits <= n
,因為 n
加大) ,不過 Linux 核心內的版本就沒那麼幸運,因為他只是要找最低位的 set bit 位置,而與 n
的設定沒有關係,因此 branch prediction 很可能猜錯,透過結果也可以猜出 branch 大致上以預測為進入判斷式為主。
我的程式碼的優勢:
因為並沒有實驗 branch prediction miss 的次數,因此不提到。
perf
做比較目前可以看到 branch-misses 是不支援的,原因似乎出自 CPU,等待我在其他裝置上嘗試
Performance counter stats for './fns':
<not supported> cycles
326980.52 msec cpu-clock # 1.000 CPUs utilized
326949836000 ns user_time # 999.906 M/sec
7998000 ns system_time # 24.460 K/sec
326981.93 msec task-clock # 1.000 CPUs utilized
327019524641 ns duration_time # 1.000 G/sec
<not supported> instructions
<not supported> branches
<not supported> branch-misses
327.019524641 seconds time elapsed
326.949836000 seconds user
0.007998000 seconds sys
使用 $ perf record -e cycles ./fns
Samples: 1M of event 'cpu-clock:uH', Event count (approx.): 393692500000
Overhead Com Shared Object Symbol
31.54% fns fns [.] main
27.05% fns fns [.] __ffs
10.55% fns libc.so.6 [.] __random
8.62% fns libc.so.6 [.] __random_r
3.98% fns libc.so.6 [.] __aarch64_swp4_rel
3.72% fns libc.so.6 [.] __aarch64_cas4_acq
3.52% fns fns [.] fns2
3.21% fns fns [.] fns
2.89% fns fns [.] rand@plt
2.53% fns fns [.] __clear_bit
可以看到我的版本 fns2
較使用 __ffs
及 __clear_bit
的 fns
,使用更少的 cpu-clock
原圖來看,原始 fns branch prediction miss 的次數佔比非常高
而我的版本在佔比最高的地方也並非是 branch prediction
起初並未知道此問題,因此僅以 $ gcc -c fns.c -o fns.o
進行編譯
補充實驗
使用以下編譯指令,並使用虛擬機器 ubuntu 22.04 執行
$ gcc -c -O2 fns.c -o fns.o
$ gcc fns.o -o fns
$ ./fns
original ver.==========================================
Average time : 0.002114 seconds
Average time CPU cycles : 7620518 cycles
new ver.===============================================
Average time : 0.000349 seconds
Average time CPU cycles : 1267415 cycles
可以看到效能差異減小至平均花費時間 (6.05x faster) 與 cycle 數 (6.01x less) ,為原始數據約 60% ,不過還是表現較佳。
改以原生 Linux 環境執行
original ver.==========================================
Average time : 0.002162 seconds
Average time CPU cycles : 4301711 cycles
new ver.===============================================
Average time : 0.000343 seconds
Average time CPU cycles : 682152 cycles
可以看到效能差異不大但還是有提升 (6.30x faster) 與 cycle 數 (6.30x less) ,不過 cycle 數大幅減少。
想問是指 ffs 有分為軟體實作,如下
static inline unsigned long __ffs(unsigned long word)
{
int num = 0;
#if BITS_PER_LONG == 64
if (!(word & 0xffffffff)) {
num += 32;
word >>= 32;
}
#endif
...
}
及硬體實作,如下:
static inline unsigned long __ffs(unsigned long x)
{
int ret;
__asm__ ("l.ff1 %0,%1"
: "=r" (ret)
: "r" (x));
return ret-1;
}
而硬體實作是以 inline assembly 實作,需要比較我的版本與硬體實作版本的差異嗎?
目前使用虛擬機器及原生 Linux 環境皆會出現以下錯誤
fns.c: Assembler messages:
fns.c:58: Error: no such instruction: `l.ff1 %ecx,%ecx'
fns.c:58: Error: no such instruction: `l.ff1 %ecx,%ecx'
然而我無論如何都查不到 l.ff1 %0,%1
的 assembly instruction。
以下為 __ffs
的 inline,
#ifdef CONFIG_OPENRISC_HAVE_INST_FF1
static inline unsigned long __ffs(unsigned long x)
{
int ret;
__asm__ ("l.ff1 %0,%1"
: "=r" (ret)
: "r" (x));
return ret-1;
}
#else
#include <asm-generic/bitops/__ffs.h>
#endif
不過就找不到定義 CONFIG_OPENRISC_HAVE_INST_FF1
的地方了
若 __ffs
為使用 builtin funtion ,那麼結果反而會較差,我後來認為是即使判斷式較少,但因計算 1 的數量的程式碼為不斷地將 1 左移並做 & 運算,因此 cycle 數反而超過了少量判斷式省下來的 cycle 數而提升。因此邱冠維同學提出了更好的方法,經過與 linux 核心開發者及另一位 bitmap 的 reviewer 討論過後,目前可能的程式碼為
static inline unsigned long fns(unsigned long word, unsigned int n)
{
while (word && n--)
word &= word - 1;
return word ? __ffs(word) : BITS_PER_LONG;
}
此方法利用了 Brian Kernighan’s Algorithm
,可以僅做一次 word &= word - 1
就達成原程式碼的 clear bit ,假如一 word 為 0b1000000
,那麼僅須一次 word &= word - 1
即可以等同清除一位 set bit 而無須先透過 __ffs
計算最低位 set bit 並作清除的動作。但我的方法須使用以下程式碼做計算並得出 一定範圍內的 set bit ,而且會有重複計算相同範圍的問題(假設 32 位內的 set bit 過多,那麼不會進入判斷式,且在下一步會計算 16 位內的 set bit ,可以發現其實是會重複算最後的範圍)
#define __const_hweight8(w) \
((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))) + \
(!!((word) & (1ULL << 2))) + (!!((word) & (1ULL << 3)))) + \
(!!((word) & (1ULL << 4))) + (!!((word) & (1ULL << 5)))) + \
(!!((word) & (1ULL << 6))) + (!!((word) & (1ULL << 7))))
想當然前者會更為高效,不過當 n 越大時 (且要有足夠的 n ),會有相反的結果,以下為結果,其中 original 為邱同學的程式碼,而 new 則為我的:
Test n: 0
original ver.==========================================
Average time : 0.000021 seconds
Average time CPU cycles : 68181 cycles
new ver.===============================================
Average time : 0.000201 seconds
Average time CPU cycles : 720986 cycles
Test n: 1
original ver.==========================================
Average time : 0.000023 seconds
Average time CPU cycles : 75810 cycles
new ver.===============================================
Average time : 0.000238 seconds
Average time CPU cycles : 855627 cycles
Test n: 2
original ver.==========================================
Average time : 0.000026 seconds
Average time CPU cycles : 87842 cycles
new ver.===============================================
Average time : 0.000254 seconds
Average time CPU cycles : 913202 cycles
Test n: 3
original ver.==========================================
Average time : 0.000031 seconds
Average time CPU cycles : 103377 cycles
new ver.===============================================
Average time : 0.000275 seconds
Average time CPU cycles : 987046 cycles
Test n: 4
original ver.==========================================
Average time : 0.000032 seconds
Average time CPU cycles : 110543 cycles
new ver.===============================================
Average time : 0.000274 seconds
Average time CPU cycles : 983954 cycles
Test n: 5
original ver.==========================================
Average time : 0.000042 seconds
Average time CPU cycles : 144670 cycles
new ver.===============================================
Average time : 0.000280 seconds
Average time CPU cycles : 1004897 cycles
Test n: 6
original ver.==========================================
Average time : 0.000041 seconds
Average time CPU cycles : 141425 cycles
new ver.===============================================
Average time : 0.000295 seconds
Average time CPU cycles : 1061731 cycles
Test n: 7
original ver.==========================================
Average time : 0.000046 seconds
Average time CPU cycles : 161445 cycles
new ver.===============================================
Average time : 0.000308 seconds
Average time CPU cycles : 1107051 cycles
Test n: 8
original ver.==========================================
Average time : 0.000052 seconds
Average time CPU cycles : 179576 cycles
new ver.===============================================
Average time : 0.000313 seconds
Average time CPU cycles : 1121208 cycles
Test n: 9
original ver.==========================================
Average time : 0.000055 seconds
Average time CPU cycles : 193132 cycles
new ver.===============================================
Average time : 0.000315 seconds
Average time CPU cycles : 1130842 cycles
Test n: 10
original ver.==========================================
Average time : 0.000060 seconds
Average time CPU cycles : 210818 cycles
new ver.===============================================
Average time : 0.000324 seconds
Average time CPU cycles : 1164863 cycles
Test n: 11
original ver.==========================================
Average time : 0.000065 seconds
Average time CPU cycles : 228608 cycles
new ver.===============================================
Average time : 0.000336 seconds
Average time CPU cycles : 1206329 cycles
Test n: 12
original ver.==========================================
Average time : 0.000071 seconds
Average time CPU cycles : 251015 cycles
new ver.===============================================
Average time : 0.000350 seconds
Average time CPU cycles : 1257303 cycles
Test n: 13
original ver.==========================================
Average time : 0.000076 seconds
Average time CPU cycles : 266143 cycles
new ver.===============================================
Average time : 0.000353 seconds
Average time CPU cycles : 1267077 cycles
Test n: 14
original ver.==========================================
Average time : 0.000081 seconds
Average time CPU cycles : 284067 cycles
new ver.===============================================
Average time : 0.000362 seconds
Average time CPU cycles : 1298368 cycles
Test n: 15
original ver.==========================================
Average time : 0.000087 seconds
Average time CPU cycles : 306249 cycles
new ver.===============================================
Average time : 0.000376 seconds
Average time CPU cycles : 1351340 cycles
Test n: 16
original ver.==========================================
Average time : 0.000093 seconds
Average time CPU cycles : 328772 cycles
new ver.===============================================
Average time : 0.000378 seconds
Average time CPU cycles : 1359326 cycles
Test n: 17
original ver.==========================================
Average time : 0.000098 seconds
Average time CPU cycles : 345717 cycles
new ver.===============================================
Average time : 0.000370 seconds
Average time CPU cycles : 1329794 cycles
Test n: 18
original ver.==========================================
Average time : 0.000106 seconds
Average time CPU cycles : 373836 cycles
new ver.===============================================
Average time : 0.000380 seconds
Average time CPU cycles : 1363232 cycles
Test n: 19
original ver.==========================================
Average time : 0.000109 seconds
Average time CPU cycles : 385798 cycles
new ver.===============================================
Average time : 0.000382 seconds
Average time CPU cycles : 1372295 cycles
Test n: 20
original ver.==========================================
Average time : 0.000114 seconds
Average time CPU cycles : 404843 cycles
new ver.===============================================
Average time : 0.000382 seconds
Average time CPU cycles : 1371155 cycles
Test n: 21
original ver.==========================================
Average time : 0.000124 seconds
Average time CPU cycles : 440045 cycles
new ver.===============================================
Average time : 0.000397 seconds
Average time CPU cycles : 1426729 cycles
Test n: 22
original ver.==========================================
Average time : 0.000128 seconds
Average time CPU cycles : 453378 cycles
new ver.===============================================
Average time : 0.000395 seconds
Average time CPU cycles : 1419189 cycles
Test n: 23
original ver.==========================================
Average time : 0.000134 seconds
Average time CPU cycles : 475297 cycles
new ver.===============================================
Average time : 0.000416 seconds
Average time CPU cycles : 1493521 cycles
Test n: 24
original ver.==========================================
Average time : 0.000144 seconds
Average time CPU cycles : 509954 cycles
new ver.===============================================
Average time : 0.000421 seconds
Average time CPU cycles : 1511201 cycles
Test n: 25
original ver.==========================================
Average time : 0.000151 seconds
Average time CPU cycles : 536158 cycles
new ver.===============================================
Average time : 0.000419 seconds
Average time CPU cycles : 1491608 cycles
Test n: 26
original ver.==========================================
Average time : 0.000166 seconds
Average time CPU cycles : 591555 cycles
new ver.===============================================
Average time : 0.000429 seconds
Average time CPU cycles : 1539124 cycles
Test n: 27
original ver.==========================================
Average time : 0.000176 seconds
Average time CPU cycles : 627527 cycles
new ver.===============================================
Average time : 0.000424 seconds
Average time CPU cycles : 1523694 cycles
Test n: 28
original ver.==========================================
Average time : 0.000193 seconds
Average time CPU cycles : 687868 cycles
new ver.===============================================
Average time : 0.000417 seconds
Average time CPU cycles : 1496883 cycles
Test n: 29
original ver.==========================================
Average time : 0.000215 seconds
Average time CPU cycles : 768329 cycles
new ver.===============================================
Average time : 0.000410 seconds
Average time CPU cycles : 1473115 cycles
Test n: 30
original ver.==========================================
Average time : 0.000250 seconds
Average time CPU cycles : 891316 cycles
new ver.===============================================
Average time : 0.000407 seconds
Average time CPU cycles : 1460716 cycles
Test n: 31
original ver.==========================================
Average time : 0.000269 seconds
Average time CPU cycles : 963115 cycles
new ver.===============================================
Average time : 0.000387 seconds
Average time CPU cycles : 1389422 cycles
Test n: 32
original ver.==========================================
Average time : 0.000289 seconds
Average time CPU cycles : 1032171 cycles
new ver.===============================================
Average time : 0.000370 seconds
Average time CPU cycles : 1329986 cycles
Test n: 33
original ver.==========================================
Average time : 0.000305 seconds
Average time CPU cycles : 1091329 cycles
new ver.===============================================
Average time : 0.000353 seconds
Average time CPU cycles : 1268201 cycles
Test n: 34
original ver.==========================================
Average time : 0.000325 seconds
Average time CPU cycles : 1163132 cycles
new ver.===============================================
Average time : 0.000338 seconds
Average time CPU cycles : 1213680 cycles
Test n: 35
original ver.==========================================
Average time : 0.000346 seconds
Average time CPU cycles : 1239294 cycles
new ver.===============================================
Average time : 0.000321 seconds
Average time CPU cycles : 1153416 cycles
Test n: 36
original ver.==========================================
Average time : 0.000356 seconds
Average time CPU cycles : 1274277 cycles
new ver.===============================================
Average time : 0.000308 seconds
Average time CPU cycles : 1106007 cycles
Test n: 37
original ver.==========================================
Average time : 0.000362 seconds
Average time CPU cycles : 1298737 cycles
new ver.===============================================
Average time : 0.000295 seconds
Average time CPU cycles : 1060095 cycles
Test n: 38
original ver.==========================================
Average time : 0.000366 seconds
Average time CPU cycles : 1311189 cycles
new ver.===============================================
Average time : 0.000284 seconds
Average time CPU cycles : 1020473 cycles
Test n: 39
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1324439 cycles
new ver.===============================================
Average time : 0.000281 seconds
Average time CPU cycles : 1009790 cycles
Test n: 40
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1340945 cycles
new ver.===============================================
Average time : 0.000293 seconds
Average time CPU cycles : 1050948 cycles
Test n: 41
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1334657 cycles
new ver.===============================================
Average time : 0.000279 seconds
Average time CPU cycles : 1002511 cycles
Test n: 42
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1340435 cycles
new ver.===============================================
Average time : 0.000280 seconds
Average time CPU cycles : 1005596 cycles
Test n: 43
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1324166 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 992143 cycles
Test n: 44
original ver.==========================================
Average time : 0.000369 seconds
Average time CPU cycles : 1322804 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 1002429 cycles
Test n: 45
original ver.==========================================
Average time : 0.000375 seconds
Average time CPU cycles : 1340341 cycles
new ver.===============================================
Average time : 0.000283 seconds
Average time CPU cycles : 1015056 cycles
Test n: 46
original ver.==========================================
Average time : 0.000367 seconds
Average time CPU cycles : 1313704 cycles
new ver.===============================================
Average time : 0.000275 seconds
Average time CPU cycles : 996220 cycles
Test n: 47
original ver.==========================================
Average time : 0.000368 seconds
Average time CPU cycles : 1317041 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 994093 cycles
Test n: 48
original ver.==========================================
Average time : 0.000365 seconds
Average time CPU cycles : 1309510 cycles
new ver.===============================================
Average time : 0.000273 seconds
Average time CPU cycles : 980609 cycles
Test n: 49
original ver.==========================================
Average time : 0.000368 seconds
Average time CPU cycles : 1318358 cycles
new ver.===============================================
Average time : 0.000274 seconds
Average time CPU cycles : 984556 cycles
Test n: 50
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1326410 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 982680 cycles
Test n: 51
original ver.==========================================
Average time : 0.000366 seconds
Average time CPU cycles : 1312605 cycles
new ver.===============================================
Average time : 0.000273 seconds
Average time CPU cycles : 978370 cycles
Test n: 52
original ver.==========================================
Average time : 0.000368 seconds
Average time CPU cycles : 1316405 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 995060 cycles
Test n: 53
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1324702 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 994500 cycles
Test n: 54
original ver.==========================================
Average time : 0.000372 seconds
Average time CPU cycles : 1333373 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 990131 cycles
Test n: 55
original ver.==========================================
Average time : 0.000369 seconds
Average time CPU cycles : 1323664 cycles
new ver.===============================================
Average time : 0.000281 seconds
Average time CPU cycles : 1007539 cycles
Test n: 56
original ver.==========================================
Average time : 0.000375 seconds
Average time CPU cycles : 1340914 cycles
new ver.==============================================
Average time : 0.000278 seconds
Average time CPU cycles : 996436 cycles
Test n: 57
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1338532 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 994950 cycles
Test n: 58
original ver.==========================================
Average time : 0.000376 seconds
Average time CPU cycles : 1344455 cycles
new ver.===============================================
Average time : 0.000278 seconds
Average time CPU cycles : 997226 cycles
Test n: 59
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1334166 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 979184 cycles
Test n: 60
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1336970 cycles
new ver.===============================================
Average time : 0.000280 seconds
Average time CPU cycles : 1006646 cycles
Test n: 61
original ver.==========================================
Average time : 0.000375 seconds
Average time CPU cycles : 1341625 cycles
new ver.===============================================
Average time : 0.000285 seconds
Average time CPU cycles : 1026516 cycles
Test n: 62
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1333731 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 993799 cycles
Test n: 63
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1340000 cycles
new ver.===============================================
Average time : 0.000286 seconds
Average time CPU cycles : 1025193 cycles
經由以上實驗,可以看到在 n 為 35 時,我的程式碼會開始達到較好的效能,不過因為我的程式碼計算過程非常固定,因此可以達到較為常數時間的執行時間,不會有太大幅度的變動,但邱同學的程式碼會與 n 的值成正比,最後會緩和則是因為我是採以隨機填入 1 或 0 的方式, set bits 的期望值應會約為 32 ,因此邱同學的版本花費時間未再上升。
雖然我的程式碼在 n 偏大時表現較佳,不過與版本相比在 n 篇小時會較差是最大的問題。
透過這次 Jserv 老師與邱同學的討論對我有莫大的幫助,包括改進需做嚴謹的實驗是必須的,且參考網路上現有的程式碼是必要的,雖然之前在課堂作業有看過 Brian Kernighan’s Algorithm 的使用,此次改進時卻完全沒想到,但其實一上網搜尋 count set bits 就可以看到這個演算法,我甚至認為少量的判斷式就是最佳的,不過透過結果可以看到這是不成立的。這次我也深知自己能力的不足,學到如何向他人表達你的程式碼之所以更好,並能與他人進行有效率的溝通。
最後謝謝 Jserv 及 邱冠維同學
unsigned int lut[16][4] = {{4, 4, 4, 4}, {16, 4, 4, 4}, {17, 4, 4, 4}, {32, 1, 4, 4}, \
{18, 4, 4, 4}, {32, 2, 4, 4}, {33, 2, 4, 4}, {48, 1, 2, 4}, \
{19, 4, 4, 4}, {32, 3, 4, 4}, {33, 3, 4, 4}, {48, 1, 3, 4}, \
{34, 3, 4, 4}, {48, 2, 3, 4}, {49, 2, 3, 4}, {64, 1, 2, 3}};
static inline unsigned int hweight4(uint8_t word)
{
return lut[word & 0xf][0] >> 4;
}
static inline unsigned long fns4(uint8_t word, unsigned int n)
{
return lut[word & 0xf][n] & 0xf;
}
static inline unsigned long fns8(uint8_t word, unsigned int n)
{
unsigned int w = hweight4(word);
return n < w ? fns4(word, n) : 4 + fns4(word >> 4, n - w);
}
static inline unsigned long fns16(uint16_t word, unsigned int n)
{
unsigned int w = __const_hweight8((uint8_t)word);
return n < w ? fns8((uint8_t)word, n) : 8 + fns8((uint8_t)(word >> 8), n - w);
}
static inline unsigned long fns32(uint32_t word, unsigned int n)
{
unsigned int w = __const_hweight16((uint16_t)word);
return n < w ? fns16((uint16_t)word, n) : 16 + fns16((uint16_t)(word >> 16), n - w);
}
static inline unsigned long fns64(uint64_t word, unsigned int n)
{
unsigned int w = __const_hweight32((uint32_t)word);
return n < w ? fns32((uint32_t)word, n) : 32 + fns32((uint32_t)(word >> 32), n - w);
}
/**
* fns - find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
* @n: Bit to find, counting from 0
*
* Returns N'th set bit. If no such bit found, returns >= BITS_PER_LONG
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
#if BITS_PER_LONG == 64
return fns64(word, n);
#else
return fns32(word, n);
#endif
}
Lookup table 版本1 不使用前面就計算過的版本
satic inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int pos = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
// find 32 bits
bits = __const_hweight32(word);
if (bits <= n) {
n -= bits;
word >>= 32;
pos += 32;
}
#endif
bits = __const_hweight16(word);
if (bits <= n) {
n -= bits;
word >>= 16;
pos += 16;
}
bits = __const_hweight8(word);
if (bits <= n) {
n -= bits;
word >>= 8;
pos += 8;
}
return n<8? fbs_table[word & 0xff][n] + pos: BITS_PER_LONG+1;
}
Lookup table 版本2 使用前面計算過的版本
static inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int pos = 0;
unsigned int bits_higher_higher = 0;
unsigned int bits_higher = 0;
unsigned int bits_lower = 0;
unsigned int bits_lower_lower = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
// find 32 bits
bits_higher_higher = __const_hweight8(word>>24);
bits_higher = __const_hweight8(word>>16);
bits_lower = __const_hweight8(word>>8);
bits_lower_lower = __const_hweight8(word);
bits = bits_higher_higher+bits_higher+bits_lower+bits_lower_lower;
if (bits <= n)
n -= bits;
word >>= 32;
pos += 32;
// update 16 bits
bits_lower = __const_hweight8(word>>8);
bits_lower_lower = __const_hweight8(word);
}
#else
// find 16 bits
bits_lower = __const_hweigh8(word>>8);
bits_lower_lower = __const_hweight8(word);
#endif
bits = bits_lower+bits_lower_lower;
if (bits <= n) {
n -= bits;
word >>= 16;
pos += 16;
bits_lower_lower = __const_hweight8(word);
}
bits = bits_lower_lower;
if (bits <= n) {
n -= bits;
word >>= 8;
pos += 8;
}
return n<8? fbs_table[word & 0xff][n] + pos: BITS_PER_LONG+1;
}
binary search 版本 1 不使用前面就計算過的版本
static inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int sum = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
bits = __const_hweight32(word);
if (bits <= n) {
n -= bits;
word >>= 32;
sum += 32;
}
#endif
bits = __const_hweight16(word);
if (bits <= n) {
n -= bits;
word >>= 16;
sum += 16;
}
bits = __const_hweight8(word);
if (bits <= n) {
n -= bits;
word >>= 8;
sum += 8;
}
bits = __const_hweight4(word);
if (bits <= n) {
n -= bits;
word >>= 4;
sum += 4;
}
bits = __const_hweight2(word);
if (bits <= n) {
n -= bits;
word >>= 2;
sum += 2;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
word >>= 1;
sum += 1;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
sum += 1;
}
if(!n)
return sum;
return BITS_PER_LONG+1;
}
binary search 版本 2 使用前面計算過的版本
static inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int sum = 0;
unsigned int bits_higher = 0;
unsigned int bits_lower = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
// find 32 bits
bits_higher = __const_hweight16(word>>16);
bits_lower = __const_hweight16(word);
bits = bits_higher+bits_lower;
if (bits <= n) {
n -= bits;
word >>= 32;
sum += 32;
// update 16 bits
bits_lower = __const_hweight16(word);
}
#else
// find 16 bits
bits_lower = __const_hweight16(word)
#endif
if (bits_lower <= n) {
n -= bits_lower;
word >>= 16;
sum += 16;
}
// find 8 bits
bits_higher = __const_hweight4(word>>4);
bits_lower = __const_hweight4(word);
bits = bits_higher+bits_lower;
if (bits <= n) {
n -= bits;
word >>= 8;
sum += 8;
// update 4 bits
bits_lower = __const_hweight4(word);
}
if (bits_lower <= n) {
n -= bits_lower;
word >>= 4;
sum += 4;
}
// find 2 bits
bits_higher = (unsigned int) ((word>>1) & 0x1); // 0
bits_lower = (unsigned int) (word & 0x1); // 1
bits = bits_higher+bits_lower; // 1
if (bits <= n) {
n -= bits;
word >>= 2;
sum += 2;
// update 1 bit
bits_lower = (unsigned int) (word & 0x1);
}
if (bits_lower <= n) {
n -= bits_lower;
word >>= 1;
sum += 1;
// update 1 bit
bits_lower = (unsigned int) (word & 0x1);
}
if (bits_lower <= n) {
n -= bits_lower;
sum += 1;
}
if(!n)
return sum;
return BITS_PER_LONG+1;
}