# FNS 改進
## `fns` 函式
**函式定義:**
find N'th set bit in a word ,即在一 word 長度中的記憶體尋找第 n 個 set bits
**函式作法:**
1. 利用 `__ffs` 尋找低位開始第一個 set bit 之位置
2. 若此時的 n 不為 0 ,則利用 `__clear_bit` 清除該位 (設為 0) ,再回到 1.
原程式碼:
```c
static inline void __clear_bit(unsigned long nr, volatile unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
*p &= ~mask;
}
/* find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
while (word) {
unsigned int bit = __ffs(word);
if (n-- == 0)
return bit;
__clear_bit(bit, &word);
}
return BITS_PER_LONG;
}
```
改成:
需要注意的是,我加入了原先程式碼沒有的 macro (原本只到 __const_hweight8 ),只是為了美觀,如下:
```c
#define __const_hweight2(w) \
((unsigned int) (!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))))
#define __const_hweight4(w) \
((unsigned int) (!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))) + \
(!!((w) & (1ULL << 2))) + (!!((w) & (1ULL << 3))))
```
```c
/* find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int sum = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
bits = __const_hweight32(word);
if (bits <= n) {
n -= bits;
word >>= 32;
sum += 32;
}
#endif
bits = __const_hweight16(word);
if (bits <= n) {
n -= bits;
word >>= 16;
sum += 16;
}
bits = __const_hweight8(word);
if (bits <= n) {
n -= bits;
word >>= 8;
sum += 8;
}
bits = __const_hweight4(word);
if (bits <= n) {
n -= bits;
word >>= 4;
sum += 4;
}
bits = __const_hweight2(word);
if (bits <= n) {
n -= bits;
word >>= 2;
sum += 2;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
word >>= 1;
sum += 1;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
sum += 1;
}
if(!n)
return sum;
return BITS_PER_LONG;
}
```
## 函式執行過程解釋
當 `n>0` 時,以下 block 為尋找 `n` 個 `set bit` ,不過事實上需找 `n+1` 個 `set bit` ,因此當 `n=0` 時,代表他還需要找一個,此時以下 block 則變為尋找全為 0 的 bits ,因此最後會抵達第 `n+1` 個 `set bit` 的右一位,但因為 `sum` 為從 0 開始往上加,因此會剛好等於 `n-th set bit` 的位置。
```c
bits = __const_hweightC(word & const);
if (bits <= n) {
n -= bits;
sum += const;
}
```
```
假設需找 7-th set bit
word =
0x1010010101011011
8 7 6 5 4 32 10
以上為 set bit 的對應 index ,可以看到 7-th set bit 對應位置為 13 (0 起始)
而 __const_hweightC 會回傳範圍 C 有多少個 set bits
此時 n > 0
====block 為尋找 n 個 set bit=====
C = 16 回傳 9 > n => 不進入判斷式
C = 8 回傳 5 <= n => n-=5 sum+=8
word >>= 8
0x10100101
3 2 1 0
C = 4 回傳 2 <= n => n-=2 sum+=4
word >>= 4
0x1010
1 0
此時 n = 0
====block 為尋找全為 0 的 bits=====
C = 2 回傳 1 > n => 不進入判斷式
C = 1 回傳 0 <= n => n-=0 sum+=1
word >>= 1
0x101
1 0
C = 1 回傳 1 > n => 不進入判斷式
0x101
1 0
結束
n=0 sum=13
```
可以看到此時 `word` 的 `lsb` 為 `n-th set bit` ,且右移了 13 位,即為 `n-th set bit` 的 `index`
比較:
原式: `n+1` 次 `ffs` , `n` 次 `clear bit` ,而 `ffs` 使用 `log(BITS_PER_LONG)` 次判斷式,而 `clear bit` 為常數操作,因此最後會有 `(n+1)`\*`log(BITS_PER_LONG)` + `(n + 1)(外部 while 迴圈)` 次判斷式。
我的版本: `log(BITS_PER_LONG) + 1 (再做一次 & 0x1 去完整檢查 64 bit 的範圍)` + 1 次判斷式
因此 n 越大,差距應會越明顯
## 測試
以下為測試程式碼:
可在這裡直接執行 [OnlineGDB](https://onlinegdb.com/m3bomRBlg)
:::spoiler 測試程式碼
```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Assume 64-bit architecture */
#define BITS_PER_LONG 64
#define ARRAY_SIZE 1000000
#define BITS_PER_BYTE 8
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define DIV_ROUND_UP(n, d) (((n) + (d) -1) / (d))
#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
#define __const_hweight8(w) \
((unsigned int) ((!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))) + \
(!!((w) & (1ULL << 2))) + (!!((w) & (1ULL << 3))) + \
(!!((w) & (1ULL << 4))) + (!!((w) & (1ULL << 5))) + \
(!!((w) & (1ULL << 6))) + (!!((w) & (1ULL << 7)))))
#define __const_hweight16(w) (__const_hweight8(w) + __const_hweight8((w) >> 8))
#define __const_hweight32(w) \
(__const_hweight16(w) + __const_hweight16((w) >> 16))
#define __const_hweight64(w) \
(__const_hweight32(w) + __const_hweight32((w) >> 32))
static inline unsigned long hweight_long(unsigned long w)
{
return __const_hweight64(w);
}
/* find first bit in word.
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static inline unsigned long __ffs(unsigned long word)
{
int num = 0;
#if BITS_PER_LONG == 64
if (!(word & 0xffffffff)) {
num += 32;
word >>= 32;
}
#endif
if (!(word & 0xffff)) {
num += 16;
word >>= 16;
}
if (!(word & 0xff)) {
num += 8;
word >>= 8;
}
if (!(word & 0xf)) {
num += 4;
word >>= 4;
}
if (!(word & 0x3)) {
num += 2;
word >>= 2;
}
if ((word & 0x1) == 0)
num += 1;
return num;
}
static inline void __clear_bit(unsigned long nr, volatile unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
*p &= ~mask;
}
/* find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
while (word) {
unsigned int bit = __ffs(word);
if (n-- == 0)
return bit;
__clear_bit(bit, &word);
}
return BITS_PER_LONG;
}
#define __const_hweight2(w) \
((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))))
#define __const_hweight4(w) \
((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))) + \
(!!((word) & (1ULL << 2))) + (!!((word) & (1ULL << 3))))
/* find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
*/
static inline unsigned long fns2(unsigned long word, unsigned int n)
{
unsigned long sum = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
bits = __const_hweight32(word & 0xffffffff);
if (bits <= n) {
n -= bits;
word >>= 32;
sum += 32;
}
#endif
bits = __const_hweight16(word & 0xffff);
if (bits <= n) {
n -= bits;
word >>= 16;
sum += 16;
}
bits = __const_hweight8(word & 0xff);
if (bits <= n) {
n -= bits;
word >>= 8;
sum += 8;
}
bits = __const_hweight4(word & 0xf);
if (bits <= n) {
n -= bits;
word >>= 4;
sum += 4;
}
bits = __const_hweight2(word & 0x3);
if (bits <= n) {
n -= bits;
word >>= 2;
sum += 2;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
word >>= 1;
sum += 1;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
sum += 1;
}
if(!n)
return sum;
return BITS_PER_LONG;
}
struct cycletime
{
unsigned long cycle;
double time;
};
int main()
{
unsigned long limit = 2000000000;
struct cycletime avgrec1 = {.cycle = 0, .time = 0.0};
struct cycletime avgrec2 = {.cycle = 0, .time = 0.0};
unsigned long test_t = 200;
for(unsigned long idx = 0; idx < 64; idx++){
for(unsigned long a = 0; a < test_t; a++){
// Allocate memory for the array
unsigned long *bit_array = malloc(BITS_TO_LONGS(ARRAY_SIZE) * sizeof(unsigned long));
if (!bit_array) {
perror("Memory allocation failed");
return EXIT_FAILURE;
}
// Initialize random seed
srand(a);
unsigned long t = 0;
// Set random bits
for (unsigned long i = 0; i < ARRAY_SIZE; ++i) {
unsigned long word_idx = BIT_WORD(i);
unsigned long bit_idx = i & (BITS_PER_LONG - 1);
if (rand() % 2) {
bit_array[word_idx] |= (1UL << bit_idx);
}
else
bit_array[word_idx] ^= (bit_array[word_idx] & (1UL << bit_idx));
}
unsigned long bound = BIT_WORD(ARRAY_SIZE);
unsigned long r1[bound];
unsigned long r2[bound];
clock_t start, end;
unsigned long long cycles_start, cycles_end;
start = clock();
asm volatile("rdtsc" : "=A" (cycles_start));
for (unsigned long i = 0; i < bound; ++i) {
r1[i] = fns(bit_array[i], idx);
}
asm volatile("rdtsc" : "=A" (cycles_end));
end = clock();
avgrec1.time += (double)(end - start) / CLOCKS_PER_SEC;
if((cycles_end - cycles_start) < limit)
avgrec1.cycle += (cycles_end - cycles_start);
// printf("original ver.=========================================================================================\n");
// printf("time : %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
// printf("CPU cycles : %llu cycles\n", cycles_end - cycles_start);
start = clock();
asm volatile("rdtsc" : "=A" (cycles_start));
for (unsigned long i = 0; i < bound; ++i) {
r2[i] = fns2(bit_array[i], idx);
}
asm volatile("rdtsc" : "=A" (cycles_end));
end = clock();
avgrec2.time += (double)(end - start) / CLOCKS_PER_SEC;
if((cycles_end - cycles_start) < limit)
avgrec2.cycle += (cycles_end - cycles_start);
// printf("new ver.==============================================================================================\n");
// printf("time : %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
// printf("CPU cycles : %llu cycles\n", cycles_end - cycles_start);
for (unsigned long i = 0; i < bound; ++i) {
if(r2[i] != r1[i]){
printf("Different in %ld, %lu vs %lu\n", i, r1[i], r2[i]);
}
}
free(bit_array);
}
// printf("original ver.=========================================================================================\n");
// printf("Average time : %f seconds\n", avgrec1.time/(test_t*idx));
// printf("Average CPU cycles : %lu cycles\n", avgrec1.cycle/(test_t*idx));
// printf("new ver.==============================================================================================\n");
// printf("Average time : %f seconds\n", avgrec2.time/(test_t*idx));
// printf("Average CPU cycles : %lu cycles\n", avgrec2.cycle/(test_t*idx));
}
printf("original ver.=========================================================================================\n");
printf("Average time : %f seconds\n", avgrec1.time/(test_t*64));
printf("Average CPU cycles : %lu cycles\n", avgrec1.cycle/(test_t*64));
printf("new ver.==============================================================================================\n");
printf("Average time : %f seconds\n", avgrec2.time/(test_t*64));
printf("Average CPU cycles : %lu cycles\n", avgrec2.cycle/(test_t*64));
return EXIT_SUCCESS;
}
```
:::
其中:
`test_t` 為測試輪數,即尋找 `n-th set bit` 多少次
`idx` 為 `n-th set bit`
`ARRAY_SIZE` 為一連續記憶體空間,因此每一輪中皆會送入 `1000000/64` 個 `long` 長度的 `word` 去尋找 `n-th set bit`
測試過程:
1. 配置長度為 `1000000` 的記憶體空間並隨機填入 `1` 或 `0`
2. 將每一個 `long` (這裡為`64` bits) 長度的記憶體位置傳入 `fns` 作計算,同時為了公平,兩比較函式的每一輪 (`1000000/64`) 輸入都相同,並為了更好找到平均值,每一次尋找的 set bits 皆尋找 200 次,而 set bits 為能更全面,範圍為 0~63 。
3. 共執行 (`1000000/64`)\*`64`\*`200` 次
4. 同時為避免快但錯誤,在得到結果後會將兩者輸出做比較。
## 結果(每次可能不太相同,但仍有明顯差異,不開啟 O2 選項編譯的情況):
```
皆為尋找 長度 1000000 的記憶體位置 每一個 word 內的 n-th set bit
測量 32 bit
original ver.============================
Average time : 0.005153 seconds
Average CPU cycles : 18478354 cycles
new ver.=================================
Average time : 0.000790 seconds
Average CPU cycles : 2843377 cycles
測量 64 bit
original ver.============================
Average time : 0.007763 seconds
Average CPU cycles : 27781828 cycles
new ver.=================================
Average time : 0.000729 seconds
Average CPU cycles : 2620535 cycles
```
最後輸出正確
可以看到我改寫的版本在平均花費時間 (6.52x faster、10.64x faster) 與 cycle 數 (6.5x less、10.60x less) 皆比 [linux/include/linux/bitops.h](https://github.com/torvalds/linux/blob/8f2c057754b25075aa3da132cd4fd4478cdab854/include/linux/bitops.h#L255) 裡的 `fns` 還要更佳
且也證明了 `n` 越大,差距越大,這裡因為是做 `0 ~ 31` 及 `0 ~ 63` ,所以會被平均,不過仍可以看到 `n` 增大會使平均時間增加。
而 CPU cycles 會降低我認為是 branch miss 次數降低,隨著 `n` 加大,代表需要找的 set bits 更多,在我的版本中會更容易進入判斷式 (只要 `bits <= n` ,因為 `n` 加大) ,不過 Linux 核心內的版本就沒那麼幸運,因為他只是要找最低位的 set bit 位置,而與 `n` 的設定沒有關係,因此 branch prediction 很可能猜錯,透過結果也可以猜出 branch 大致上以預測為進入判斷式為主。
我的程式碼的優勢:
1. 更快的執行時間,在 n 更大時,會有更大的差距
2. 使用更少的判斷式
3. 無需呼叫外部函示
因為並沒有實驗 branch prediction miss 的次數,因此不提到。
## 使用 `perf` 做比較
目前可以看到 branch-misses 是不支援的,[原因](https://stackoverflow.com/questions/22712956/why-does-perf-stat-show-stalled-cycles-backend-as-not-supported)似乎出自 CPU,等待我在其他裝置上嘗試
```
Performance counter stats for './fns':
<not supported> cycles
326980.52 msec cpu-clock # 1.000 CPUs utilized
326949836000 ns user_time # 999.906 M/sec
7998000 ns system_time # 24.460 K/sec
326981.93 msec task-clock # 1.000 CPUs utilized
327019524641 ns duration_time # 1.000 G/sec
<not supported> instructions
<not supported> branches
<not supported> branch-misses
327.019524641 seconds time elapsed
326.949836000 seconds user
0.007998000 seconds sys
```
使用 `$ perf record -e cycles ./fns`
```
Samples: 1M of event 'cpu-clock:uH', Event count (approx.): 393692500000
Overhead Com Shared Object Symbol
31.54% fns fns [.] main
27.05% fns fns [.] __ffs
10.55% fns libc.so.6 [.] __random
8.62% fns libc.so.6 [.] __random_r
3.98% fns libc.so.6 [.] __aarch64_swp4_rel
3.72% fns libc.so.6 [.] __aarch64_cas4_acq
3.52% fns fns [.] fns2
3.21% fns fns [.] fns
2.89% fns fns [.] rand@plt
2.53% fns fns [.] __clear_bit
```
可以看到我的版本 `fns2` 較使用 `__ffs` 及 `__clear_bit` 的 `fns` ,使用更少的 `cpu-clock`
原圖來看,原始 fns branch prediction miss 的次數佔比非常高
![截圖 2024-04-15 下午2.48.15](https://hackmd.io/_uploads/Hksnar5lC.png)
而我的版本在佔比最高的地方也並非是 branch prediction
![截圖 2024-04-15 下午2.53.19](https://hackmd.io/_uploads/SkCk18cx0.png)
## 問題:
1. 我留意到你提供的 online GDB 連結中並沒有添加任何編譯參數,但是 Linux 核心通常會在開啟 O2 選項的情況下編譯,這很可能會大幅影響所測量到的數值,不確定你測試時所採用的編譯參數為何?
起初並未知道此問題,因此僅以 `$ gcc -c fns.c -o fns.o` 進行編譯
補充實驗
使用以下編譯指令,並使用虛擬機器 ubuntu 22.04 執行
```
$ gcc -c -O2 fns.c -o fns.o
$ gcc fns.o -o fns
$ ./fns
```
```
original ver.==========================================
Average time : 0.002114 seconds
Average time CPU cycles : 7620518 cycles
new ver.===============================================
Average time : 0.000349 seconds
Average time CPU cycles : 1267415 cycles
```
可以看到效能差異減小至平均花費時間 (6.05x faster) 與 cycle 數 (6.01x less) ,為原始數據約 60% ,不過還是表現較佳。
改以原生 Linux 環境執行
```
original ver.==========================================
Average time : 0.002162 seconds
Average time CPU cycles : 4301711 cycles
new ver.===============================================
Average time : 0.000343 seconds
Average time CPU cycles : 682152 cycles
```
可以看到效能差異不大但還是有提升 (6.30x faster) 與 cycle 數 (6.30x less) ,不過 cycle 數大幅減少。
2. 有些硬體有支援 ffs 操作,但有些硬體沒有支援所以採用軟體實作,但我目前沒看到針對軟體/硬體實作分別的測試結果。
想問是指 ffs 有分為軟體實作,如下
```c
static inline unsigned long __ffs(unsigned long word)
{
int num = 0;
#if BITS_PER_LONG == 64
if (!(word & 0xffffffff)) {
num += 32;
word >>= 32;
}
#endif
...
}
```
及硬體實作,如下:
```c
static inline unsigned long __ffs(unsigned long x)
{
int ret;
__asm__ ("l.ff1 %0,%1"
: "=r" (ret)
: "r" (x));
return ret-1;
}
```
而硬體實作是以 inline assembly 實作,需要比較我的版本與硬體實作版本的差異嗎?
目前使用虛擬機器及原生 Linux 環境皆會出現以下錯誤
```
fns.c: Assembler messages:
fns.c:58: Error: no such instruction: `l.ff1 %ecx,%ecx'
fns.c:58: Error: no such instruction: `l.ff1 %ecx,%ecx'
```
然而我無論如何都查不到 `l.ff1 %0,%1` 的 assembly instruction。
以下為 `__ffs` 的 inline,
```c
#ifdef CONFIG_OPENRISC_HAVE_INST_FF1
static inline unsigned long __ffs(unsigned long x)
{
int ret;
__asm__ ("l.ff1 %0,%1"
: "=r" (ret)
: "r" (x));
return ret-1;
}
#else
#include <asm-generic/bitops/__ffs.h>
#endif
```
不過就找不到定義 `CONFIG_OPENRISC_HAVE_INST_FF1` 的地方了
## 最後結果:
若 `__ffs` 為使用 builtin funtion ,那麼結果反而會較差,我後來認為是即使判斷式較少,但因計算 1 的數量的程式碼為不斷地將 1 左移並做 & 運算,因此 cycle 數反而超過了少量判斷式省下來的 cycle 數而提升。因此邱冠維同學提出了更好的方法,經過與 linux 核心開發者及另一位 bitmap 的 reviewer 討論過後,目前可能的程式碼為
```c
static inline unsigned long fns(unsigned long word, unsigned int n)
{
while (word && n--)
word &= word - 1;
return word ? __ffs(word) : BITS_PER_LONG;
}
```
此方法利用了 `Brian Kernighan’s Algorithm` ,可以僅做一次 `word &= word - 1` 就達成原程式碼的 clear bit ,假如一 word 為 `0b1000000` ,那麼僅須一次 `word &= word - 1` 即可以等同清除一位 set bit 而無須先透過 `__ffs` 計算最低位 set bit 並作清除的動作。但我的方法須使用以下程式碼做計算並得出 一定範圍內的 set bit ,而且會有**重複計算相同範圍的問題**(假設 32 位內的 set bit 過多,那麼不會進入判斷式,且在下一步會計算 16 位內的 set bit ,可以發現其實是會重複算最後的範圍)
```c
#define __const_hweight8(w) \
((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))) + \
(!!((word) & (1ULL << 2))) + (!!((word) & (1ULL << 3)))) + \
(!!((word) & (1ULL << 4))) + (!!((word) & (1ULL << 5)))) + \
(!!((word) & (1ULL << 6))) + (!!((word) & (1ULL << 7))))
```
想當然前者會更為高效,不過當 n 越大時 (且要有足夠的 n ),會有相反的結果,以下為結果,其中 original 為邱同學的程式碼,而 new 則為我的:
```
Test n: 0
original ver.==========================================
Average time : 0.000021 seconds
Average time CPU cycles : 68181 cycles
new ver.===============================================
Average time : 0.000201 seconds
Average time CPU cycles : 720986 cycles
Test n: 1
original ver.==========================================
Average time : 0.000023 seconds
Average time CPU cycles : 75810 cycles
new ver.===============================================
Average time : 0.000238 seconds
Average time CPU cycles : 855627 cycles
Test n: 2
original ver.==========================================
Average time : 0.000026 seconds
Average time CPU cycles : 87842 cycles
new ver.===============================================
Average time : 0.000254 seconds
Average time CPU cycles : 913202 cycles
Test n: 3
original ver.==========================================
Average time : 0.000031 seconds
Average time CPU cycles : 103377 cycles
new ver.===============================================
Average time : 0.000275 seconds
Average time CPU cycles : 987046 cycles
Test n: 4
original ver.==========================================
Average time : 0.000032 seconds
Average time CPU cycles : 110543 cycles
new ver.===============================================
Average time : 0.000274 seconds
Average time CPU cycles : 983954 cycles
Test n: 5
original ver.==========================================
Average time : 0.000042 seconds
Average time CPU cycles : 144670 cycles
new ver.===============================================
Average time : 0.000280 seconds
Average time CPU cycles : 1004897 cycles
Test n: 6
original ver.==========================================
Average time : 0.000041 seconds
Average time CPU cycles : 141425 cycles
new ver.===============================================
Average time : 0.000295 seconds
Average time CPU cycles : 1061731 cycles
Test n: 7
original ver.==========================================
Average time : 0.000046 seconds
Average time CPU cycles : 161445 cycles
new ver.===============================================
Average time : 0.000308 seconds
Average time CPU cycles : 1107051 cycles
Test n: 8
original ver.==========================================
Average time : 0.000052 seconds
Average time CPU cycles : 179576 cycles
new ver.===============================================
Average time : 0.000313 seconds
Average time CPU cycles : 1121208 cycles
Test n: 9
original ver.==========================================
Average time : 0.000055 seconds
Average time CPU cycles : 193132 cycles
new ver.===============================================
Average time : 0.000315 seconds
Average time CPU cycles : 1130842 cycles
Test n: 10
original ver.==========================================
Average time : 0.000060 seconds
Average time CPU cycles : 210818 cycles
new ver.===============================================
Average time : 0.000324 seconds
Average time CPU cycles : 1164863 cycles
Test n: 11
original ver.==========================================
Average time : 0.000065 seconds
Average time CPU cycles : 228608 cycles
new ver.===============================================
Average time : 0.000336 seconds
Average time CPU cycles : 1206329 cycles
Test n: 12
original ver.==========================================
Average time : 0.000071 seconds
Average time CPU cycles : 251015 cycles
new ver.===============================================
Average time : 0.000350 seconds
Average time CPU cycles : 1257303 cycles
Test n: 13
original ver.==========================================
Average time : 0.000076 seconds
Average time CPU cycles : 266143 cycles
new ver.===============================================
Average time : 0.000353 seconds
Average time CPU cycles : 1267077 cycles
Test n: 14
original ver.==========================================
Average time : 0.000081 seconds
Average time CPU cycles : 284067 cycles
new ver.===============================================
Average time : 0.000362 seconds
Average time CPU cycles : 1298368 cycles
Test n: 15
original ver.==========================================
Average time : 0.000087 seconds
Average time CPU cycles : 306249 cycles
new ver.===============================================
Average time : 0.000376 seconds
Average time CPU cycles : 1351340 cycles
Test n: 16
original ver.==========================================
Average time : 0.000093 seconds
Average time CPU cycles : 328772 cycles
new ver.===============================================
Average time : 0.000378 seconds
Average time CPU cycles : 1359326 cycles
Test n: 17
original ver.==========================================
Average time : 0.000098 seconds
Average time CPU cycles : 345717 cycles
new ver.===============================================
Average time : 0.000370 seconds
Average time CPU cycles : 1329794 cycles
Test n: 18
original ver.==========================================
Average time : 0.000106 seconds
Average time CPU cycles : 373836 cycles
new ver.===============================================
Average time : 0.000380 seconds
Average time CPU cycles : 1363232 cycles
Test n: 19
original ver.==========================================
Average time : 0.000109 seconds
Average time CPU cycles : 385798 cycles
new ver.===============================================
Average time : 0.000382 seconds
Average time CPU cycles : 1372295 cycles
Test n: 20
original ver.==========================================
Average time : 0.000114 seconds
Average time CPU cycles : 404843 cycles
new ver.===============================================
Average time : 0.000382 seconds
Average time CPU cycles : 1371155 cycles
Test n: 21
original ver.==========================================
Average time : 0.000124 seconds
Average time CPU cycles : 440045 cycles
new ver.===============================================
Average time : 0.000397 seconds
Average time CPU cycles : 1426729 cycles
Test n: 22
original ver.==========================================
Average time : 0.000128 seconds
Average time CPU cycles : 453378 cycles
new ver.===============================================
Average time : 0.000395 seconds
Average time CPU cycles : 1419189 cycles
Test n: 23
original ver.==========================================
Average time : 0.000134 seconds
Average time CPU cycles : 475297 cycles
new ver.===============================================
Average time : 0.000416 seconds
Average time CPU cycles : 1493521 cycles
Test n: 24
original ver.==========================================
Average time : 0.000144 seconds
Average time CPU cycles : 509954 cycles
new ver.===============================================
Average time : 0.000421 seconds
Average time CPU cycles : 1511201 cycles
Test n: 25
original ver.==========================================
Average time : 0.000151 seconds
Average time CPU cycles : 536158 cycles
new ver.===============================================
Average time : 0.000419 seconds
Average time CPU cycles : 1491608 cycles
Test n: 26
original ver.==========================================
Average time : 0.000166 seconds
Average time CPU cycles : 591555 cycles
new ver.===============================================
Average time : 0.000429 seconds
Average time CPU cycles : 1539124 cycles
Test n: 27
original ver.==========================================
Average time : 0.000176 seconds
Average time CPU cycles : 627527 cycles
new ver.===============================================
Average time : 0.000424 seconds
Average time CPU cycles : 1523694 cycles
Test n: 28
original ver.==========================================
Average time : 0.000193 seconds
Average time CPU cycles : 687868 cycles
new ver.===============================================
Average time : 0.000417 seconds
Average time CPU cycles : 1496883 cycles
Test n: 29
original ver.==========================================
Average time : 0.000215 seconds
Average time CPU cycles : 768329 cycles
new ver.===============================================
Average time : 0.000410 seconds
Average time CPU cycles : 1473115 cycles
Test n: 30
original ver.==========================================
Average time : 0.000250 seconds
Average time CPU cycles : 891316 cycles
new ver.===============================================
Average time : 0.000407 seconds
Average time CPU cycles : 1460716 cycles
Test n: 31
original ver.==========================================
Average time : 0.000269 seconds
Average time CPU cycles : 963115 cycles
new ver.===============================================
Average time : 0.000387 seconds
Average time CPU cycles : 1389422 cycles
Test n: 32
original ver.==========================================
Average time : 0.000289 seconds
Average time CPU cycles : 1032171 cycles
new ver.===============================================
Average time : 0.000370 seconds
Average time CPU cycles : 1329986 cycles
Test n: 33
original ver.==========================================
Average time : 0.000305 seconds
Average time CPU cycles : 1091329 cycles
new ver.===============================================
Average time : 0.000353 seconds
Average time CPU cycles : 1268201 cycles
Test n: 34
original ver.==========================================
Average time : 0.000325 seconds
Average time CPU cycles : 1163132 cycles
new ver.===============================================
Average time : 0.000338 seconds
Average time CPU cycles : 1213680 cycles
Test n: 35
original ver.==========================================
Average time : 0.000346 seconds
Average time CPU cycles : 1239294 cycles
new ver.===============================================
Average time : 0.000321 seconds
Average time CPU cycles : 1153416 cycles
Test n: 36
original ver.==========================================
Average time : 0.000356 seconds
Average time CPU cycles : 1274277 cycles
new ver.===============================================
Average time : 0.000308 seconds
Average time CPU cycles : 1106007 cycles
Test n: 37
original ver.==========================================
Average time : 0.000362 seconds
Average time CPU cycles : 1298737 cycles
new ver.===============================================
Average time : 0.000295 seconds
Average time CPU cycles : 1060095 cycles
Test n: 38
original ver.==========================================
Average time : 0.000366 seconds
Average time CPU cycles : 1311189 cycles
new ver.===============================================
Average time : 0.000284 seconds
Average time CPU cycles : 1020473 cycles
Test n: 39
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1324439 cycles
new ver.===============================================
Average time : 0.000281 seconds
Average time CPU cycles : 1009790 cycles
Test n: 40
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1340945 cycles
new ver.===============================================
Average time : 0.000293 seconds
Average time CPU cycles : 1050948 cycles
Test n: 41
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1334657 cycles
new ver.===============================================
Average time : 0.000279 seconds
Average time CPU cycles : 1002511 cycles
Test n: 42
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1340435 cycles
new ver.===============================================
Average time : 0.000280 seconds
Average time CPU cycles : 1005596 cycles
Test n: 43
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1324166 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 992143 cycles
Test n: 44
original ver.==========================================
Average time : 0.000369 seconds
Average time CPU cycles : 1322804 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 1002429 cycles
Test n: 45
original ver.==========================================
Average time : 0.000375 seconds
Average time CPU cycles : 1340341 cycles
new ver.===============================================
Average time : 0.000283 seconds
Average time CPU cycles : 1015056 cycles
Test n: 46
original ver.==========================================
Average time : 0.000367 seconds
Average time CPU cycles : 1313704 cycles
new ver.===============================================
Average time : 0.000275 seconds
Average time CPU cycles : 996220 cycles
Test n: 47
original ver.==========================================
Average time : 0.000368 seconds
Average time CPU cycles : 1317041 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 994093 cycles
Test n: 48
original ver.==========================================
Average time : 0.000365 seconds
Average time CPU cycles : 1309510 cycles
new ver.===============================================
Average time : 0.000273 seconds
Average time CPU cycles : 980609 cycles
Test n: 49
original ver.==========================================
Average time : 0.000368 seconds
Average time CPU cycles : 1318358 cycles
new ver.===============================================
Average time : 0.000274 seconds
Average time CPU cycles : 984556 cycles
Test n: 50
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1326410 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 982680 cycles
Test n: 51
original ver.==========================================
Average time : 0.000366 seconds
Average time CPU cycles : 1312605 cycles
new ver.===============================================
Average time : 0.000273 seconds
Average time CPU cycles : 978370 cycles
Test n: 52
original ver.==========================================
Average time : 0.000368 seconds
Average time CPU cycles : 1316405 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 995060 cycles
Test n: 53
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1324702 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 994500 cycles
Test n: 54
original ver.==========================================
Average time : 0.000372 seconds
Average time CPU cycles : 1333373 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 990131 cycles
Test n: 55
original ver.==========================================
Average time : 0.000369 seconds
Average time CPU cycles : 1323664 cycles
new ver.===============================================
Average time : 0.000281 seconds
Average time CPU cycles : 1007539 cycles
Test n: 56
original ver.==========================================
Average time : 0.000375 seconds
Average time CPU cycles : 1340914 cycles
new ver.==============================================
Average time : 0.000278 seconds
Average time CPU cycles : 996436 cycles
Test n: 57
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1338532 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 994950 cycles
Test n: 58
original ver.==========================================
Average time : 0.000376 seconds
Average time CPU cycles : 1344455 cycles
new ver.===============================================
Average time : 0.000278 seconds
Average time CPU cycles : 997226 cycles
Test n: 59
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1334166 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 979184 cycles
Test n: 60
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1336970 cycles
new ver.===============================================
Average time : 0.000280 seconds
Average time CPU cycles : 1006646 cycles
Test n: 61
original ver.==========================================
Average time : 0.000375 seconds
Average time CPU cycles : 1341625 cycles
new ver.===============================================
Average time : 0.000285 seconds
Average time CPU cycles : 1026516 cycles
Test n: 62
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1333731 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 993799 cycles
Test n: 63
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1340000 cycles
new ver.===============================================
Average time : 0.000286 seconds
Average time CPU cycles : 1025193 cycles
```
經由以上實驗,可以看到在 n 為 35 時,我的程式碼會開始達到較好的效能,不過因為我的程式碼計算過程非常固定,因此可以達到較為常數時間的執行時間,不會有太大幅度的變動,但邱同學的程式碼會與 n 的值成正比,最後會緩和則是因為我是採以隨機填入 1 或 0 的方式, set bits 的期望值應會約為 32 ,因此邱同學的版本花費時間未再上升。
雖然我的程式碼在 n 偏大時表現較佳,不過與版本相比在 n 篇小時會較差是最大的問題。
## 心得
透過這次 Jserv 老師與邱同學的討論對我有莫大的幫助,包括改進需做嚴謹的實驗是必須的,且參考網路上現有的程式碼是必要的,雖然之前在課堂作業有看過 Brian Kernighan’s Algorithm 的使用,此次改進時卻完全沒想到,但其實一上網搜尋 count set bits 就可以看到這個演算法,我甚至認為少量的判斷式就是最佳的,不過透過結果可以看到這是不成立的。這次我也深知自己能力的不足,學到如何向他人表達你的程式碼之所以更好,並能與他人進行有效率的溝通。
<!-- 相關與 Linux 核心開發者討論 [這裡](https://mail.google.com/mail/u/1/?ik=75069e04e5&view=pt&search=all&permthid=thread-f:1797464826894404805&simpl=msg-f:1797464826894404805&simpl=msg-f:1797595341939973051&simpl=msg-f:1797655536150274489&simpl=msg-f:1797683603239017549&simpl=msg-f:1797684187377995497&simpl=msg-f:1797687063530644928&simpl=msg-f:1797687553500865377&simpl=msg-f:1797689577552236236) -->
最後謝謝 Jserv 及 邱冠維同學
## lookup table 及其他版本
```c
unsigned int lut[16][4] = {{4, 4, 4, 4}, {16, 4, 4, 4}, {17, 4, 4, 4}, {32, 1, 4, 4}, \
{18, 4, 4, 4}, {32, 2, 4, 4}, {33, 2, 4, 4}, {48, 1, 2, 4}, \
{19, 4, 4, 4}, {32, 3, 4, 4}, {33, 3, 4, 4}, {48, 1, 3, 4}, \
{34, 3, 4, 4}, {48, 2, 3, 4}, {49, 2, 3, 4}, {64, 1, 2, 3}};
static inline unsigned int hweight4(uint8_t word)
{
return lut[word & 0xf][0] >> 4;
}
static inline unsigned long fns4(uint8_t word, unsigned int n)
{
return lut[word & 0xf][n] & 0xf;
}
static inline unsigned long fns8(uint8_t word, unsigned int n)
{
unsigned int w = hweight4(word);
return n < w ? fns4(word, n) : 4 + fns4(word >> 4, n - w);
}
static inline unsigned long fns16(uint16_t word, unsigned int n)
{
unsigned int w = __const_hweight8((uint8_t)word);
return n < w ? fns8((uint8_t)word, n) : 8 + fns8((uint8_t)(word >> 8), n - w);
}
static inline unsigned long fns32(uint32_t word, unsigned int n)
{
unsigned int w = __const_hweight16((uint16_t)word);
return n < w ? fns16((uint16_t)word, n) : 16 + fns16((uint16_t)(word >> 16), n - w);
}
static inline unsigned long fns64(uint64_t word, unsigned int n)
{
unsigned int w = __const_hweight32((uint32_t)word);
return n < w ? fns32((uint32_t)word, n) : 32 + fns32((uint32_t)(word >> 32), n - w);
}
/**
* fns - find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
* @n: Bit to find, counting from 0
*
* Returns N'th set bit. If no such bit found, returns >= BITS_PER_LONG
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
#if BITS_PER_LONG == 64
return fns64(word, n);
#else
return fns32(word, n);
#endif
}
```
Lookup table 版本1 不使用前面就計算過的版本
```c
satic inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int pos = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
// find 32 bits
bits = __const_hweight32(word);
if (bits <= n) {
n -= bits;
word >>= 32;
pos += 32;
}
#endif
bits = __const_hweight16(word);
if (bits <= n) {
n -= bits;
word >>= 16;
pos += 16;
}
bits = __const_hweight8(word);
if (bits <= n) {
n -= bits;
word >>= 8;
pos += 8;
}
return n<8? fbs_table[word & 0xff][n] + pos: BITS_PER_LONG+1;
}
```
Lookup table 版本2 使用前面計算過的版本
```c
static inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int pos = 0;
unsigned int bits_higher_higher = 0;
unsigned int bits_higher = 0;
unsigned int bits_lower = 0;
unsigned int bits_lower_lower = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
// find 32 bits
bits_higher_higher = __const_hweight8(word>>24);
bits_higher = __const_hweight8(word>>16);
bits_lower = __const_hweight8(word>>8);
bits_lower_lower = __const_hweight8(word);
bits = bits_higher_higher+bits_higher+bits_lower+bits_lower_lower;
if (bits <= n)
n -= bits;
word >>= 32;
pos += 32;
// update 16 bits
bits_lower = __const_hweight8(word>>8);
bits_lower_lower = __const_hweight8(word);
}
#else
// find 16 bits
bits_lower = __const_hweigh8(word>>8);
bits_lower_lower = __const_hweight8(word);
#endif
bits = bits_lower+bits_lower_lower;
if (bits <= n) {
n -= bits;
word >>= 16;
pos += 16;
bits_lower_lower = __const_hweight8(word);
}
bits = bits_lower_lower;
if (bits <= n) {
n -= bits;
word >>= 8;
pos += 8;
}
return n<8? fbs_table[word & 0xff][n] + pos: BITS_PER_LONG+1;
}
```
binary search 版本 1 不使用前面就計算過的版本
```c
static inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int sum = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
bits = __const_hweight32(word);
if (bits <= n) {
n -= bits;
word >>= 32;
sum += 32;
}
#endif
bits = __const_hweight16(word);
if (bits <= n) {
n -= bits;
word >>= 16;
sum += 16;
}
bits = __const_hweight8(word);
if (bits <= n) {
n -= bits;
word >>= 8;
sum += 8;
}
bits = __const_hweight4(word);
if (bits <= n) {
n -= bits;
word >>= 4;
sum += 4;
}
bits = __const_hweight2(word);
if (bits <= n) {
n -= bits;
word >>= 2;
sum += 2;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
word >>= 1;
sum += 1;
}
bits = (unsigned int) (word & 0x1);
if (bits <= n) {
n -= bits;
sum += 1;
}
if(!n)
return sum;
return BITS_PER_LONG+1;
}
```
binary search 版本 2 使用前面計算過的版本
```c
static inline unsigned long fns(unsigned long word, unsigned int n)
{
unsigned int sum = 0;
unsigned int bits_higher = 0;
unsigned int bits_lower = 0;
unsigned int bits = 0;
#if BITS_PER_LONG == 64
// find 32 bits
bits_higher = __const_hweight16(word>>16);
bits_lower = __const_hweight16(word);
bits = bits_higher+bits_lower;
if (bits <= n) {
n -= bits;
word >>= 32;
sum += 32;
// update 16 bits
bits_lower = __const_hweight16(word);
}
#else
// find 16 bits
bits_lower = __const_hweight16(word)
#endif
if (bits_lower <= n) {
n -= bits_lower;
word >>= 16;
sum += 16;
}
// find 8 bits
bits_higher = __const_hweight4(word>>4);
bits_lower = __const_hweight4(word);
bits = bits_higher+bits_lower;
if (bits <= n) {
n -= bits;
word >>= 8;
sum += 8;
// update 4 bits
bits_lower = __const_hweight4(word);
}
if (bits_lower <= n) {
n -= bits_lower;
word >>= 4;
sum += 4;
}
// find 2 bits
bits_higher = (unsigned int) ((word>>1) & 0x1); // 0
bits_lower = (unsigned int) (word & 0x1); // 1
bits = bits_higher+bits_lower; // 1
if (bits <= n) {
n -= bits;
word >>= 2;
sum += 2;
// update 1 bit
bits_lower = (unsigned int) (word & 0x1);
}
if (bits_lower <= n) {
n -= bits_lower;
word >>= 1;
sum += 1;
// update 1 bit
bits_lower = (unsigned int) (word & 0x1);
}
if (bits_lower <= n) {
n -= bits_lower;
sum += 1;
}
if(!n)
return sum;
return BITS_PER_LONG+1;
}
```