# FNS 改進 ## `fns` 函式 **函式定義:** find N'th set bit in a word ,即在一 word 長度中的記憶體尋找第 n 個 set bits **函式作法:** 1. 利用 `__ffs` 尋找低位開始第一個 set bit 之位置 2. 若此時的 n 不為 0 ,則利用 `__clear_bit` 清除該位 (設為 0) ,再回到 1. 原程式碼: ```c static inline void __clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr); *p &= ~mask; } /* find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned long fns(unsigned long word, unsigned int n) { while (word) { unsigned int bit = __ffs(word); if (n-- == 0) return bit; __clear_bit(bit, &word); } return BITS_PER_LONG; } ``` 改成: 需要注意的是,我加入了原先程式碼沒有的 macro (原本只到 __const_hweight8 ),只是為了美觀,如下: ```c #define __const_hweight2(w) \ ((unsigned int) (!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1)))) #define __const_hweight4(w) \ ((unsigned int) (!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))) + \ (!!((w) & (1ULL << 2))) + (!!((w) & (1ULL << 3)))) ``` ```c /* find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned long fns(unsigned long word, unsigned int n) { unsigned int sum = 0; unsigned int bits = 0; #if BITS_PER_LONG == 64 bits = __const_hweight32(word); if (bits <= n) { n -= bits; word >>= 32; sum += 32; } #endif bits = __const_hweight16(word); if (bits <= n) { n -= bits; word >>= 16; sum += 16; } bits = __const_hweight8(word); if (bits <= n) { n -= bits; word >>= 8; sum += 8; } bits = __const_hweight4(word); if (bits <= n) { n -= bits; word >>= 4; sum += 4; } bits = __const_hweight2(word); if (bits <= n) { n -= bits; word >>= 2; sum += 2; } bits = (unsigned int) (word & 0x1); if (bits <= n) { n -= bits; word >>= 1; sum += 1; } bits = (unsigned int) (word & 0x1); if (bits <= n) { n -= bits; sum += 1; } if(!n) return sum; return BITS_PER_LONG; } ``` ## 函式執行過程解釋 當 `n>0` 時,以下 block 為尋找 `n` 個 `set bit` ,不過事實上需找 `n+1` 個 `set bit` ,因此當 `n=0` 時,代表他還需要找一個,此時以下 block 則變為尋找全為 0 的 bits ,因此最後會抵達第 `n+1` 個 `set bit` 的右一位,但因為 `sum` 為從 0 開始往上加,因此會剛好等於 `n-th set bit` 的位置。 ```c bits = __const_hweightC(word & const); if (bits <= n) { n -= bits; sum += const; } ``` ``` 假設需找 7-th set bit word = 0x1010010101011011 8 7 6 5 4 32 10 以上為 set bit 的對應 index ,可以看到 7-th set bit 對應位置為 13 (0 起始) 而 __const_hweightC 會回傳範圍 C 有多少個 set bits 此時 n > 0 ====block 為尋找 n 個 set bit===== C = 16 回傳 9 > n => 不進入判斷式 C = 8 回傳 5 <= n => n-=5 sum+=8 word >>= 8 0x10100101 3 2 1 0 C = 4 回傳 2 <= n => n-=2 sum+=4 word >>= 4 0x1010 1 0 此時 n = 0 ====block 為尋找全為 0 的 bits===== C = 2 回傳 1 > n => 不進入判斷式 C = 1 回傳 0 <= n => n-=0 sum+=1 word >>= 1 0x101 1 0 C = 1 回傳 1 > n => 不進入判斷式 0x101 1 0 結束 n=0 sum=13 ``` 可以看到此時 `word` 的 `lsb` 為 `n-th set bit` ,且右移了 13 位,即為 `n-th set bit` 的 `index` 比較: 原式: `n+1` 次 `ffs` , `n` 次 `clear bit` ,而 `ffs` 使用 `log(BITS_PER_LONG)` 次判斷式,而 `clear bit` 為常數操作,因此最後會有 `(n+1)`\*`log(BITS_PER_LONG)` + `(n + 1)(外部 while 迴圈)` 次判斷式。 我的版本: `log(BITS_PER_LONG) + 1 (再做一次 & 0x1 去完整檢查 64 bit 的範圍)` + 1 次判斷式 因此 n 越大,差距應會越明顯 ## 測試 以下為測試程式碼: 可在這裡直接執行 [OnlineGDB](https://onlinegdb.com/m3bomRBlg) :::spoiler 測試程式碼 ```c #include <assert.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <time.h> /* Assume 64-bit architecture */ #define BITS_PER_LONG 64 #define ARRAY_SIZE 1000000 #define BITS_PER_BYTE 8 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) #define DIV_ROUND_UP(n, d) (((n) + (d) -1) / (d)) #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) #define __const_hweight8(w) \ ((unsigned int) ((!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))) + \ (!!((w) & (1ULL << 2))) + (!!((w) & (1ULL << 3))) + \ (!!((w) & (1ULL << 4))) + (!!((w) & (1ULL << 5))) + \ (!!((w) & (1ULL << 6))) + (!!((w) & (1ULL << 7))))) #define __const_hweight16(w) (__const_hweight8(w) + __const_hweight8((w) >> 8)) #define __const_hweight32(w) \ (__const_hweight16(w) + __const_hweight16((w) >> 16)) #define __const_hweight64(w) \ (__const_hweight32(w) + __const_hweight32((w) >> 32)) static inline unsigned long hweight_long(unsigned long w) { return __const_hweight64(w); } /* find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static inline unsigned long __ffs(unsigned long word) { int num = 0; #if BITS_PER_LONG == 64 if (!(word & 0xffffffff)) { num += 32; word >>= 32; } #endif if (!(word & 0xffff)) { num += 16; word >>= 16; } if (!(word & 0xff)) { num += 8; word >>= 8; } if (!(word & 0xf)) { num += 4; word >>= 4; } if (!(word & 0x3)) { num += 2; word >>= 2; } if ((word & 0x1) == 0) num += 1; return num; } static inline void __clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr); *p &= ~mask; } /* find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned long fns(unsigned long word, unsigned int n) { while (word) { unsigned int bit = __ffs(word); if (n-- == 0) return bit; __clear_bit(bit, &word); } return BITS_PER_LONG; } #define __const_hweight2(w) \ ((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1)))) #define __const_hweight4(w) \ ((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))) + \ (!!((word) & (1ULL << 2))) + (!!((word) & (1ULL << 3)))) /* find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned long fns2(unsigned long word, unsigned int n) { unsigned long sum = 0; unsigned int bits = 0; #if BITS_PER_LONG == 64 bits = __const_hweight32(word & 0xffffffff); if (bits <= n) { n -= bits; word >>= 32; sum += 32; } #endif bits = __const_hweight16(word & 0xffff); if (bits <= n) { n -= bits; word >>= 16; sum += 16; } bits = __const_hweight8(word & 0xff); if (bits <= n) { n -= bits; word >>= 8; sum += 8; } bits = __const_hweight4(word & 0xf); if (bits <= n) { n -= bits; word >>= 4; sum += 4; } bits = __const_hweight2(word & 0x3); if (bits <= n) { n -= bits; word >>= 2; sum += 2; } bits = (unsigned int) (word & 0x1); if (bits <= n) { n -= bits; word >>= 1; sum += 1; } bits = (unsigned int) (word & 0x1); if (bits <= n) { n -= bits; sum += 1; } if(!n) return sum; return BITS_PER_LONG; } struct cycletime { unsigned long cycle; double time; }; int main() { unsigned long limit = 2000000000; struct cycletime avgrec1 = {.cycle = 0, .time = 0.0}; struct cycletime avgrec2 = {.cycle = 0, .time = 0.0}; unsigned long test_t = 200; for(unsigned long idx = 0; idx < 64; idx++){ for(unsigned long a = 0; a < test_t; a++){ // Allocate memory for the array unsigned long *bit_array = malloc(BITS_TO_LONGS(ARRAY_SIZE) * sizeof(unsigned long)); if (!bit_array) { perror("Memory allocation failed"); return EXIT_FAILURE; } // Initialize random seed srand(a); unsigned long t = 0; // Set random bits for (unsigned long i = 0; i < ARRAY_SIZE; ++i) { unsigned long word_idx = BIT_WORD(i); unsigned long bit_idx = i & (BITS_PER_LONG - 1); if (rand() % 2) { bit_array[word_idx] |= (1UL << bit_idx); } else bit_array[word_idx] ^= (bit_array[word_idx] & (1UL << bit_idx)); } unsigned long bound = BIT_WORD(ARRAY_SIZE); unsigned long r1[bound]; unsigned long r2[bound]; clock_t start, end; unsigned long long cycles_start, cycles_end; start = clock(); asm volatile("rdtsc" : "=A" (cycles_start)); for (unsigned long i = 0; i < bound; ++i) { r1[i] = fns(bit_array[i], idx); } asm volatile("rdtsc" : "=A" (cycles_end)); end = clock(); avgrec1.time += (double)(end - start) / CLOCKS_PER_SEC; if((cycles_end - cycles_start) < limit) avgrec1.cycle += (cycles_end - cycles_start); // printf("original ver.=========================================================================================\n"); // printf("time : %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC); // printf("CPU cycles : %llu cycles\n", cycles_end - cycles_start); start = clock(); asm volatile("rdtsc" : "=A" (cycles_start)); for (unsigned long i = 0; i < bound; ++i) { r2[i] = fns2(bit_array[i], idx); } asm volatile("rdtsc" : "=A" (cycles_end)); end = clock(); avgrec2.time += (double)(end - start) / CLOCKS_PER_SEC; if((cycles_end - cycles_start) < limit) avgrec2.cycle += (cycles_end - cycles_start); // printf("new ver.==============================================================================================\n"); // printf("time : %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC); // printf("CPU cycles : %llu cycles\n", cycles_end - cycles_start); for (unsigned long i = 0; i < bound; ++i) { if(r2[i] != r1[i]){ printf("Different in %ld, %lu vs %lu\n", i, r1[i], r2[i]); } } free(bit_array); } // printf("original ver.=========================================================================================\n"); // printf("Average time : %f seconds\n", avgrec1.time/(test_t*idx)); // printf("Average CPU cycles : %lu cycles\n", avgrec1.cycle/(test_t*idx)); // printf("new ver.==============================================================================================\n"); // printf("Average time : %f seconds\n", avgrec2.time/(test_t*idx)); // printf("Average CPU cycles : %lu cycles\n", avgrec2.cycle/(test_t*idx)); } printf("original ver.=========================================================================================\n"); printf("Average time : %f seconds\n", avgrec1.time/(test_t*64)); printf("Average CPU cycles : %lu cycles\n", avgrec1.cycle/(test_t*64)); printf("new ver.==============================================================================================\n"); printf("Average time : %f seconds\n", avgrec2.time/(test_t*64)); printf("Average CPU cycles : %lu cycles\n", avgrec2.cycle/(test_t*64)); return EXIT_SUCCESS; } ``` ::: 其中: `test_t` 為測試輪數,即尋找 `n-th set bit` 多少次 `idx` 為 `n-th set bit` `ARRAY_SIZE` 為一連續記憶體空間,因此每一輪中皆會送入 `1000000/64` 個 `long` 長度的 `word` 去尋找 `n-th set bit` 測試過程: 1. 配置長度為 `1000000` 的記憶體空間並隨機填入 `1` 或 `0` 2. 將每一個 `long` (這裡為`64` bits) 長度的記憶體位置傳入 `fns` 作計算,同時為了公平,兩比較函式的每一輪 (`1000000/64`) 輸入都相同,並為了更好找到平均值,每一次尋找的 set bits 皆尋找 200 次,而 set bits 為能更全面,範圍為 0~63 。 3. 共執行 (`1000000/64`)\*`64`\*`200` 次 4. 同時為避免快但錯誤,在得到結果後會將兩者輸出做比較。 ## 結果(每次可能不太相同,但仍有明顯差異,不開啟 O2 選項編譯的情況): ``` 皆為尋找 長度 1000000 的記憶體位置 每一個 word 內的 n-th set bit 測量 32 bit original ver.============================ Average time : 0.005153 seconds Average CPU cycles : 18478354 cycles new ver.================================= Average time : 0.000790 seconds Average CPU cycles : 2843377 cycles 測量 64 bit original ver.============================ Average time : 0.007763 seconds Average CPU cycles : 27781828 cycles new ver.================================= Average time : 0.000729 seconds Average CPU cycles : 2620535 cycles ``` 最後輸出正確 可以看到我改寫的版本在平均花費時間 (6.52x faster、10.64x faster) 與 cycle 數 (6.5x less、10.60x less) 皆比 [linux/include/linux/bitops.h](https://github.com/torvalds/linux/blob/8f2c057754b25075aa3da132cd4fd4478cdab854/include/linux/bitops.h#L255) 裡的 `fns` 還要更佳 且也證明了 `n` 越大,差距越大,這裡因為是做 `0 ~ 31` 及 `0 ~ 63` ,所以會被平均,不過仍可以看到 `n` 增大會使平均時間增加。 而 CPU cycles 會降低我認為是 branch miss 次數降低,隨著 `n` 加大,代表需要找的 set bits 更多,在我的版本中會更容易進入判斷式 (只要 `bits <= n` ,因為 `n` 加大) ,不過 Linux 核心內的版本就沒那麼幸運,因為他只是要找最低位的 set bit 位置,而與 `n` 的設定沒有關係,因此 branch prediction 很可能猜錯,透過結果也可以猜出 branch 大致上以預測為進入判斷式為主。 我的程式碼的優勢: 1. 更快的執行時間,在 n 更大時,會有更大的差距 2. 使用更少的判斷式 3. 無需呼叫外部函示 因為並沒有實驗 branch prediction miss 的次數,因此不提到。 ## 使用 `perf` 做比較 目前可以看到 branch-misses 是不支援的,[原因](https://stackoverflow.com/questions/22712956/why-does-perf-stat-show-stalled-cycles-backend-as-not-supported)似乎出自 CPU,等待我在其他裝置上嘗試 ``` Performance counter stats for './fns': <not supported> cycles 326980.52 msec cpu-clock # 1.000 CPUs utilized 326949836000 ns user_time # 999.906 M/sec 7998000 ns system_time # 24.460 K/sec 326981.93 msec task-clock # 1.000 CPUs utilized 327019524641 ns duration_time # 1.000 G/sec <not supported> instructions <not supported> branches <not supported> branch-misses 327.019524641 seconds time elapsed 326.949836000 seconds user 0.007998000 seconds sys ``` 使用 `$ perf record -e cycles ./fns` ``` Samples: 1M of event 'cpu-clock:uH', Event count (approx.): 393692500000 Overhead Com Shared Object Symbol 31.54% fns fns [.] main 27.05% fns fns [.] __ffs 10.55% fns libc.so.6 [.] __random 8.62% fns libc.so.6 [.] __random_r 3.98% fns libc.so.6 [.] __aarch64_swp4_rel 3.72% fns libc.so.6 [.] __aarch64_cas4_acq 3.52% fns fns [.] fns2 3.21% fns fns [.] fns 2.89% fns fns [.] rand@plt 2.53% fns fns [.] __clear_bit ``` 可以看到我的版本 `fns2` 較使用 `__ffs` 及 `__clear_bit` 的 `fns` ,使用更少的 `cpu-clock` 原圖來看,原始 fns branch prediction miss 的次數佔比非常高 ![截圖 2024-04-15 下午2.48.15](https://hackmd.io/_uploads/Hksnar5lC.png) 而我的版本在佔比最高的地方也並非是 branch prediction ![截圖 2024-04-15 下午2.53.19](https://hackmd.io/_uploads/SkCk18cx0.png) ## 問題: 1. 我留意到你提供的 online GDB 連結中並沒有添加任何編譯參數,但是 Linux 核心通常會在開啟 O2 選項的情況下編譯,這很可能會大幅影響所測量到的數值,不確定你測試時所採用的編譯參數為何? 起初並未知道此問題,因此僅以 `$ gcc -c fns.c -o fns.o` 進行編譯 補充實驗 使用以下編譯指令,並使用虛擬機器 ubuntu 22.04 執行 ``` $ gcc -c -O2 fns.c -o fns.o $ gcc fns.o -o fns $ ./fns ``` ``` original ver.========================================== Average time : 0.002114 seconds Average time CPU cycles : 7620518 cycles new ver.=============================================== Average time : 0.000349 seconds Average time CPU cycles : 1267415 cycles ``` 可以看到效能差異減小至平均花費時間 (6.05x faster) 與 cycle 數 (6.01x less) ,為原始數據約 60% ,不過還是表現較佳。 改以原生 Linux 環境執行 ``` original ver.========================================== Average time : 0.002162 seconds Average time CPU cycles : 4301711 cycles new ver.=============================================== Average time : 0.000343 seconds Average time CPU cycles : 682152 cycles ``` 可以看到效能差異不大但還是有提升 (6.30x faster) 與 cycle 數 (6.30x less) ,不過 cycle 數大幅減少。 2. 有些硬體有支援 ffs 操作,但有些硬體沒有支援所以採用軟體實作,但我目前沒看到針對軟體/硬體實作分別的測試結果。 想問是指 ffs 有分為軟體實作,如下 ```c static inline unsigned long __ffs(unsigned long word) { int num = 0; #if BITS_PER_LONG == 64 if (!(word & 0xffffffff)) { num += 32; word >>= 32; } #endif ... } ``` 及硬體實作,如下: ```c static inline unsigned long __ffs(unsigned long x) { int ret; __asm__ ("l.ff1 %0,%1" : "=r" (ret) : "r" (x)); return ret-1; } ``` 而硬體實作是以 inline assembly 實作,需要比較我的版本與硬體實作版本的差異嗎? 目前使用虛擬機器及原生 Linux 環境皆會出現以下錯誤 ``` fns.c: Assembler messages: fns.c:58: Error: no such instruction: `l.ff1 %ecx,%ecx' fns.c:58: Error: no such instruction: `l.ff1 %ecx,%ecx' ``` 然而我無論如何都查不到 `l.ff1 %0,%1` 的 assembly instruction。 以下為 `__ffs` 的 inline, ```c #ifdef CONFIG_OPENRISC_HAVE_INST_FF1 static inline unsigned long __ffs(unsigned long x) { int ret; __asm__ ("l.ff1 %0,%1" : "=r" (ret) : "r" (x)); return ret-1; } #else #include <asm-generic/bitops/__ffs.h> #endif ``` 不過就找不到定義 `CONFIG_OPENRISC_HAVE_INST_FF1` 的地方了 ## 最後結果: 若 `__ffs` 為使用 builtin funtion ,那麼結果反而會較差,我後來認為是即使判斷式較少,但因計算 1 的數量的程式碼為不斷地將 1 左移並做 & 運算,因此 cycle 數反而超過了少量判斷式省下來的 cycle 數而提升。因此邱冠維同學提出了更好的方法,經過與 linux 核心開發者及另一位 bitmap 的 reviewer 討論過後,目前可能的程式碼為 ```c static inline unsigned long fns(unsigned long word, unsigned int n) { while (word && n--) word &= word - 1; return word ? __ffs(word) : BITS_PER_LONG; } ``` 此方法利用了 `Brian Kernighan’s Algorithm` ,可以僅做一次 `word &= word - 1` 就達成原程式碼的 clear bit ,假如一 word 為 `0b1000000` ,那麼僅須一次 `word &= word - 1` 即可以等同清除一位 set bit 而無須先透過 `__ffs` 計算最低位 set bit 並作清除的動作。但我的方法須使用以下程式碼做計算並得出 一定範圍內的 set bit ,而且會有**重複計算相同範圍的問題**(假設 32 位內的 set bit 過多,那麼不會進入判斷式,且在下一步會計算 16 位內的 set bit ,可以發現其實是會重複算最後的範圍) ```c #define __const_hweight8(w) \ ((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))) + \ (!!((word) & (1ULL << 2))) + (!!((word) & (1ULL << 3)))) + \ (!!((word) & (1ULL << 4))) + (!!((word) & (1ULL << 5)))) + \ (!!((word) & (1ULL << 6))) + (!!((word) & (1ULL << 7)))) ``` 想當然前者會更為高效,不過當 n 越大時 (且要有足夠的 n ),會有相反的結果,以下為結果,其中 original 為邱同學的程式碼,而 new 則為我的: ``` Test n: 0 original ver.========================================== Average time : 0.000021 seconds Average time CPU cycles : 68181 cycles new ver.=============================================== Average time : 0.000201 seconds Average time CPU cycles : 720986 cycles Test n: 1 original ver.========================================== Average time : 0.000023 seconds Average time CPU cycles : 75810 cycles new ver.=============================================== Average time : 0.000238 seconds Average time CPU cycles : 855627 cycles Test n: 2 original ver.========================================== Average time : 0.000026 seconds Average time CPU cycles : 87842 cycles new ver.=============================================== Average time : 0.000254 seconds Average time CPU cycles : 913202 cycles Test n: 3 original ver.========================================== Average time : 0.000031 seconds Average time CPU cycles : 103377 cycles new ver.=============================================== Average time : 0.000275 seconds Average time CPU cycles : 987046 cycles Test n: 4 original ver.========================================== Average time : 0.000032 seconds Average time CPU cycles : 110543 cycles new ver.=============================================== Average time : 0.000274 seconds Average time CPU cycles : 983954 cycles Test n: 5 original ver.========================================== Average time : 0.000042 seconds Average time CPU cycles : 144670 cycles new ver.=============================================== Average time : 0.000280 seconds Average time CPU cycles : 1004897 cycles Test n: 6 original ver.========================================== Average time : 0.000041 seconds Average time CPU cycles : 141425 cycles new ver.=============================================== Average time : 0.000295 seconds Average time CPU cycles : 1061731 cycles Test n: 7 original ver.========================================== Average time : 0.000046 seconds Average time CPU cycles : 161445 cycles new ver.=============================================== Average time : 0.000308 seconds Average time CPU cycles : 1107051 cycles Test n: 8 original ver.========================================== Average time : 0.000052 seconds Average time CPU cycles : 179576 cycles new ver.=============================================== Average time : 0.000313 seconds Average time CPU cycles : 1121208 cycles Test n: 9 original ver.========================================== Average time : 0.000055 seconds Average time CPU cycles : 193132 cycles new ver.=============================================== Average time : 0.000315 seconds Average time CPU cycles : 1130842 cycles Test n: 10 original ver.========================================== Average time : 0.000060 seconds Average time CPU cycles : 210818 cycles new ver.=============================================== Average time : 0.000324 seconds Average time CPU cycles : 1164863 cycles Test n: 11 original ver.========================================== Average time : 0.000065 seconds Average time CPU cycles : 228608 cycles new ver.=============================================== Average time : 0.000336 seconds Average time CPU cycles : 1206329 cycles Test n: 12 original ver.========================================== Average time : 0.000071 seconds Average time CPU cycles : 251015 cycles new ver.=============================================== Average time : 0.000350 seconds Average time CPU cycles : 1257303 cycles Test n: 13 original ver.========================================== Average time : 0.000076 seconds Average time CPU cycles : 266143 cycles new ver.=============================================== Average time : 0.000353 seconds Average time CPU cycles : 1267077 cycles Test n: 14 original ver.========================================== Average time : 0.000081 seconds Average time CPU cycles : 284067 cycles new ver.=============================================== Average time : 0.000362 seconds Average time CPU cycles : 1298368 cycles Test n: 15 original ver.========================================== Average time : 0.000087 seconds Average time CPU cycles : 306249 cycles new ver.=============================================== Average time : 0.000376 seconds Average time CPU cycles : 1351340 cycles Test n: 16 original ver.========================================== Average time : 0.000093 seconds Average time CPU cycles : 328772 cycles new ver.=============================================== Average time : 0.000378 seconds Average time CPU cycles : 1359326 cycles Test n: 17 original ver.========================================== Average time : 0.000098 seconds Average time CPU cycles : 345717 cycles new ver.=============================================== Average time : 0.000370 seconds Average time CPU cycles : 1329794 cycles Test n: 18 original ver.========================================== Average time : 0.000106 seconds Average time CPU cycles : 373836 cycles new ver.=============================================== Average time : 0.000380 seconds Average time CPU cycles : 1363232 cycles Test n: 19 original ver.========================================== Average time : 0.000109 seconds Average time CPU cycles : 385798 cycles new ver.=============================================== Average time : 0.000382 seconds Average time CPU cycles : 1372295 cycles Test n: 20 original ver.========================================== Average time : 0.000114 seconds Average time CPU cycles : 404843 cycles new ver.=============================================== Average time : 0.000382 seconds Average time CPU cycles : 1371155 cycles Test n: 21 original ver.========================================== Average time : 0.000124 seconds Average time CPU cycles : 440045 cycles new ver.=============================================== Average time : 0.000397 seconds Average time CPU cycles : 1426729 cycles Test n: 22 original ver.========================================== Average time : 0.000128 seconds Average time CPU cycles : 453378 cycles new ver.=============================================== Average time : 0.000395 seconds Average time CPU cycles : 1419189 cycles Test n: 23 original ver.========================================== Average time : 0.000134 seconds Average time CPU cycles : 475297 cycles new ver.=============================================== Average time : 0.000416 seconds Average time CPU cycles : 1493521 cycles Test n: 24 original ver.========================================== Average time : 0.000144 seconds Average time CPU cycles : 509954 cycles new ver.=============================================== Average time : 0.000421 seconds Average time CPU cycles : 1511201 cycles Test n: 25 original ver.========================================== Average time : 0.000151 seconds Average time CPU cycles : 536158 cycles new ver.=============================================== Average time : 0.000419 seconds Average time CPU cycles : 1491608 cycles Test n: 26 original ver.========================================== Average time : 0.000166 seconds Average time CPU cycles : 591555 cycles new ver.=============================================== Average time : 0.000429 seconds Average time CPU cycles : 1539124 cycles Test n: 27 original ver.========================================== Average time : 0.000176 seconds Average time CPU cycles : 627527 cycles new ver.=============================================== Average time : 0.000424 seconds Average time CPU cycles : 1523694 cycles Test n: 28 original ver.========================================== Average time : 0.000193 seconds Average time CPU cycles : 687868 cycles new ver.=============================================== Average time : 0.000417 seconds Average time CPU cycles : 1496883 cycles Test n: 29 original ver.========================================== Average time : 0.000215 seconds Average time CPU cycles : 768329 cycles new ver.=============================================== Average time : 0.000410 seconds Average time CPU cycles : 1473115 cycles Test n: 30 original ver.========================================== Average time : 0.000250 seconds Average time CPU cycles : 891316 cycles new ver.=============================================== Average time : 0.000407 seconds Average time CPU cycles : 1460716 cycles Test n: 31 original ver.========================================== Average time : 0.000269 seconds Average time CPU cycles : 963115 cycles new ver.=============================================== Average time : 0.000387 seconds Average time CPU cycles : 1389422 cycles Test n: 32 original ver.========================================== Average time : 0.000289 seconds Average time CPU cycles : 1032171 cycles new ver.=============================================== Average time : 0.000370 seconds Average time CPU cycles : 1329986 cycles Test n: 33 original ver.========================================== Average time : 0.000305 seconds Average time CPU cycles : 1091329 cycles new ver.=============================================== Average time : 0.000353 seconds Average time CPU cycles : 1268201 cycles Test n: 34 original ver.========================================== Average time : 0.000325 seconds Average time CPU cycles : 1163132 cycles new ver.=============================================== Average time : 0.000338 seconds Average time CPU cycles : 1213680 cycles Test n: 35 original ver.========================================== Average time : 0.000346 seconds Average time CPU cycles : 1239294 cycles new ver.=============================================== Average time : 0.000321 seconds Average time CPU cycles : 1153416 cycles Test n: 36 original ver.========================================== Average time : 0.000356 seconds Average time CPU cycles : 1274277 cycles new ver.=============================================== Average time : 0.000308 seconds Average time CPU cycles : 1106007 cycles Test n: 37 original ver.========================================== Average time : 0.000362 seconds Average time CPU cycles : 1298737 cycles new ver.=============================================== Average time : 0.000295 seconds Average time CPU cycles : 1060095 cycles Test n: 38 original ver.========================================== Average time : 0.000366 seconds Average time CPU cycles : 1311189 cycles new ver.=============================================== Average time : 0.000284 seconds Average time CPU cycles : 1020473 cycles Test n: 39 original ver.========================================== Average time : 0.000370 seconds Average time CPU cycles : 1324439 cycles new ver.=============================================== Average time : 0.000281 seconds Average time CPU cycles : 1009790 cycles Test n: 40 original ver.========================================== Average time : 0.000374 seconds Average time CPU cycles : 1340945 cycles new ver.=============================================== Average time : 0.000293 seconds Average time CPU cycles : 1050948 cycles Test n: 41 original ver.========================================== Average time : 0.000373 seconds Average time CPU cycles : 1334657 cycles new ver.=============================================== Average time : 0.000279 seconds Average time CPU cycles : 1002511 cycles Test n: 42 original ver.========================================== Average time : 0.000374 seconds Average time CPU cycles : 1340435 cycles new ver.=============================================== Average time : 0.000280 seconds Average time CPU cycles : 1005596 cycles Test n: 43 original ver.========================================== Average time : 0.000370 seconds Average time CPU cycles : 1324166 cycles new ver.=============================================== Average time : 0.000276 seconds Average time CPU cycles : 992143 cycles Test n: 44 original ver.========================================== Average time : 0.000369 seconds Average time CPU cycles : 1322804 cycles new ver.=============================================== Average time : 0.000277 seconds Average time CPU cycles : 1002429 cycles Test n: 45 original ver.========================================== Average time : 0.000375 seconds Average time CPU cycles : 1340341 cycles new ver.=============================================== Average time : 0.000283 seconds Average time CPU cycles : 1015056 cycles Test n: 46 original ver.========================================== Average time : 0.000367 seconds Average time CPU cycles : 1313704 cycles new ver.=============================================== Average time : 0.000275 seconds Average time CPU cycles : 996220 cycles Test n: 47 original ver.========================================== Average time : 0.000368 seconds Average time CPU cycles : 1317041 cycles new ver.=============================================== Average time : 0.000277 seconds Average time CPU cycles : 994093 cycles Test n: 48 original ver.========================================== Average time : 0.000365 seconds Average time CPU cycles : 1309510 cycles new ver.=============================================== Average time : 0.000273 seconds Average time CPU cycles : 980609 cycles Test n: 49 original ver.========================================== Average time : 0.000368 seconds Average time CPU cycles : 1318358 cycles new ver.=============================================== Average time : 0.000274 seconds Average time CPU cycles : 984556 cycles Test n: 50 original ver.========================================== Average time : 0.000370 seconds Average time CPU cycles : 1326410 cycles new ver.=============================================== Average time : 0.000276 seconds Average time CPU cycles : 982680 cycles Test n: 51 original ver.========================================== Average time : 0.000366 seconds Average time CPU cycles : 1312605 cycles new ver.=============================================== Average time : 0.000273 seconds Average time CPU cycles : 978370 cycles Test n: 52 original ver.========================================== Average time : 0.000368 seconds Average time CPU cycles : 1316405 cycles new ver.=============================================== Average time : 0.000277 seconds Average time CPU cycles : 995060 cycles Test n: 53 original ver.========================================== Average time : 0.000370 seconds Average time CPU cycles : 1324702 cycles new ver.=============================================== Average time : 0.000277 seconds Average time CPU cycles : 994500 cycles Test n: 54 original ver.========================================== Average time : 0.000372 seconds Average time CPU cycles : 1333373 cycles new ver.=============================================== Average time : 0.000276 seconds Average time CPU cycles : 990131 cycles Test n: 55 original ver.========================================== Average time : 0.000369 seconds Average time CPU cycles : 1323664 cycles new ver.=============================================== Average time : 0.000281 seconds Average time CPU cycles : 1007539 cycles Test n: 56 original ver.========================================== Average time : 0.000375 seconds Average time CPU cycles : 1340914 cycles new ver.============================================== Average time : 0.000278 seconds Average time CPU cycles : 996436 cycles Test n: 57 original ver.========================================== Average time : 0.000374 seconds Average time CPU cycles : 1338532 cycles new ver.=============================================== Average time : 0.000277 seconds Average time CPU cycles : 994950 cycles Test n: 58 original ver.========================================== Average time : 0.000376 seconds Average time CPU cycles : 1344455 cycles new ver.=============================================== Average time : 0.000278 seconds Average time CPU cycles : 997226 cycles Test n: 59 original ver.========================================== Average time : 0.000373 seconds Average time CPU cycles : 1334166 cycles new ver.=============================================== Average time : 0.000276 seconds Average time CPU cycles : 979184 cycles Test n: 60 original ver.========================================== Average time : 0.000373 seconds Average time CPU cycles : 1336970 cycles new ver.=============================================== Average time : 0.000280 seconds Average time CPU cycles : 1006646 cycles Test n: 61 original ver.========================================== Average time : 0.000375 seconds Average time CPU cycles : 1341625 cycles new ver.=============================================== Average time : 0.000285 seconds Average time CPU cycles : 1026516 cycles Test n: 62 original ver.========================================== Average time : 0.000373 seconds Average time CPU cycles : 1333731 cycles new ver.=============================================== Average time : 0.000277 seconds Average time CPU cycles : 993799 cycles Test n: 63 original ver.========================================== Average time : 0.000374 seconds Average time CPU cycles : 1340000 cycles new ver.=============================================== Average time : 0.000286 seconds Average time CPU cycles : 1025193 cycles ``` 經由以上實驗,可以看到在 n 為 35 時,我的程式碼會開始達到較好的效能,不過因為我的程式碼計算過程非常固定,因此可以達到較為常數時間的執行時間,不會有太大幅度的變動,但邱同學的程式碼會與 n 的值成正比,最後會緩和則是因為我是採以隨機填入 1 或 0 的方式, set bits 的期望值應會約為 32 ,因此邱同學的版本花費時間未再上升。 雖然我的程式碼在 n 偏大時表現較佳,不過與版本相比在 n 篇小時會較差是最大的問題。 ## 心得 透過這次 Jserv 老師與邱同學的討論對我有莫大的幫助,包括改進需做嚴謹的實驗是必須的,且參考網路上現有的程式碼是必要的,雖然之前在課堂作業有看過 Brian Kernighan’s Algorithm 的使用,此次改進時卻完全沒想到,但其實一上網搜尋 count set bits 就可以看到這個演算法,我甚至認為少量的判斷式就是最佳的,不過透過結果可以看到這是不成立的。這次我也深知自己能力的不足,學到如何向他人表達你的程式碼之所以更好,並能與他人進行有效率的溝通。 <!-- 相關與 Linux 核心開發者討論 [這裡](https://mail.google.com/mail/u/1/?ik=75069e04e5&view=pt&search=all&permthid=thread-f:1797464826894404805&simpl=msg-f:1797464826894404805&simpl=msg-f:1797595341939973051&simpl=msg-f:1797655536150274489&simpl=msg-f:1797683603239017549&simpl=msg-f:1797684187377995497&simpl=msg-f:1797687063530644928&simpl=msg-f:1797687553500865377&simpl=msg-f:1797689577552236236) --> 最後謝謝 Jserv 及 邱冠維同學 ## lookup table 及其他版本 ```c unsigned int lut[16][4] = {{4, 4, 4, 4}, {16, 4, 4, 4}, {17, 4, 4, 4}, {32, 1, 4, 4}, \ {18, 4, 4, 4}, {32, 2, 4, 4}, {33, 2, 4, 4}, {48, 1, 2, 4}, \ {19, 4, 4, 4}, {32, 3, 4, 4}, {33, 3, 4, 4}, {48, 1, 3, 4}, \ {34, 3, 4, 4}, {48, 2, 3, 4}, {49, 2, 3, 4}, {64, 1, 2, 3}}; static inline unsigned int hweight4(uint8_t word) { return lut[word & 0xf][0] >> 4; } static inline unsigned long fns4(uint8_t word, unsigned int n) { return lut[word & 0xf][n] & 0xf; } static inline unsigned long fns8(uint8_t word, unsigned int n) { unsigned int w = hweight4(word); return n < w ? fns4(word, n) : 4 + fns4(word >> 4, n - w); } static inline unsigned long fns16(uint16_t word, unsigned int n) { unsigned int w = __const_hweight8((uint8_t)word); return n < w ? fns8((uint8_t)word, n) : 8 + fns8((uint8_t)(word >> 8), n - w); } static inline unsigned long fns32(uint32_t word, unsigned int n) { unsigned int w = __const_hweight16((uint16_t)word); return n < w ? fns16((uint16_t)word, n) : 16 + fns16((uint16_t)(word >> 16), n - w); } static inline unsigned long fns64(uint64_t word, unsigned int n) { unsigned int w = __const_hweight32((uint32_t)word); return n < w ? fns32((uint32_t)word, n) : 32 + fns32((uint32_t)(word >> 32), n - w); } /** * fns - find N'th set bit in a word * @word: The word to search * @n: Bit to find * @n: Bit to find, counting from 0 * * Returns N'th set bit. If no such bit found, returns >= BITS_PER_LONG */ static inline unsigned long fns(unsigned long word, unsigned int n) { #if BITS_PER_LONG == 64 return fns64(word, n); #else return fns32(word, n); #endif } ``` Lookup table 版本1 不使用前面就計算過的版本 ```c satic inline unsigned long fns(unsigned long word, unsigned int n) { unsigned int pos = 0; unsigned int bits = 0; #if BITS_PER_LONG == 64 // find 32 bits bits = __const_hweight32(word); if (bits <= n) { n -= bits; word >>= 32; pos += 32; } #endif bits = __const_hweight16(word); if (bits <= n) { n -= bits; word >>= 16; pos += 16; } bits = __const_hweight8(word); if (bits <= n) { n -= bits; word >>= 8; pos += 8; } return n<8? fbs_table[word & 0xff][n] + pos: BITS_PER_LONG+1; } ``` Lookup table 版本2 使用前面計算過的版本 ```c static inline unsigned long fns(unsigned long word, unsigned int n) { unsigned int pos = 0; unsigned int bits_higher_higher = 0; unsigned int bits_higher = 0; unsigned int bits_lower = 0; unsigned int bits_lower_lower = 0; unsigned int bits = 0; #if BITS_PER_LONG == 64 // find 32 bits bits_higher_higher = __const_hweight8(word>>24); bits_higher = __const_hweight8(word>>16); bits_lower = __const_hweight8(word>>8); bits_lower_lower = __const_hweight8(word); bits = bits_higher_higher+bits_higher+bits_lower+bits_lower_lower; if (bits <= n) n -= bits; word >>= 32; pos += 32; // update 16 bits bits_lower = __const_hweight8(word>>8); bits_lower_lower = __const_hweight8(word); } #else // find 16 bits bits_lower = __const_hweigh8(word>>8); bits_lower_lower = __const_hweight8(word); #endif bits = bits_lower+bits_lower_lower; if (bits <= n) { n -= bits; word >>= 16; pos += 16; bits_lower_lower = __const_hweight8(word); } bits = bits_lower_lower; if (bits <= n) { n -= bits; word >>= 8; pos += 8; } return n<8? fbs_table[word & 0xff][n] + pos: BITS_PER_LONG+1; } ``` binary search 版本 1 不使用前面就計算過的版本 ```c static inline unsigned long fns(unsigned long word, unsigned int n) { unsigned int sum = 0; unsigned int bits = 0; #if BITS_PER_LONG == 64 bits = __const_hweight32(word); if (bits <= n) { n -= bits; word >>= 32; sum += 32; } #endif bits = __const_hweight16(word); if (bits <= n) { n -= bits; word >>= 16; sum += 16; } bits = __const_hweight8(word); if (bits <= n) { n -= bits; word >>= 8; sum += 8; } bits = __const_hweight4(word); if (bits <= n) { n -= bits; word >>= 4; sum += 4; } bits = __const_hweight2(word); if (bits <= n) { n -= bits; word >>= 2; sum += 2; } bits = (unsigned int) (word & 0x1); if (bits <= n) { n -= bits; word >>= 1; sum += 1; } bits = (unsigned int) (word & 0x1); if (bits <= n) { n -= bits; sum += 1; } if(!n) return sum; return BITS_PER_LONG+1; } ``` binary search 版本 2 使用前面計算過的版本 ```c static inline unsigned long fns(unsigned long word, unsigned int n) { unsigned int sum = 0; unsigned int bits_higher = 0; unsigned int bits_lower = 0; unsigned int bits = 0; #if BITS_PER_LONG == 64 // find 32 bits bits_higher = __const_hweight16(word>>16); bits_lower = __const_hweight16(word); bits = bits_higher+bits_lower; if (bits <= n) { n -= bits; word >>= 32; sum += 32; // update 16 bits bits_lower = __const_hweight16(word); } #else // find 16 bits bits_lower = __const_hweight16(word) #endif if (bits_lower <= n) { n -= bits_lower; word >>= 16; sum += 16; } // find 8 bits bits_higher = __const_hweight4(word>>4); bits_lower = __const_hweight4(word); bits = bits_higher+bits_lower; if (bits <= n) { n -= bits; word >>= 8; sum += 8; // update 4 bits bits_lower = __const_hweight4(word); } if (bits_lower <= n) { n -= bits_lower; word >>= 4; sum += 4; } // find 2 bits bits_higher = (unsigned int) ((word>>1) & 0x1); // 0 bits_lower = (unsigned int) (word & 0x1); // 1 bits = bits_higher+bits_lower; // 1 if (bits <= n) { n -= bits; word >>= 2; sum += 2; // update 1 bit bits_lower = (unsigned int) (word & 0x1); } if (bits_lower <= n) { n -= bits_lower; word >>= 1; sum += 1; // update 1 bit bits_lower = (unsigned int) (word & 0x1); } if (bits_lower <= n) { n -= bits_lower; sum += 1; } if(!n) return sum; return BITS_PER_LONG+1; } ```