FNS 改進

fns 函式

函式定義:
find N'th set bit in a word ,即在一 word 長度中的記憶體尋找第 n 個 set bits
函式作法:

  1. 利用 __ffs 尋找低位開始第一個 set bit 之位置
  2. 若此時的 n 不為 0 ,則利用 __clear_bit 清除該位 (設為 0) ,再回到 1.

原程式碼:

static inline void __clear_bit(unsigned long nr, volatile unsigned long *addr)
{
    unsigned long mask = BIT_MASK(nr);
    unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);

    *p &= ~mask;
}

/* find N'th set bit in a word
 * @word: The word to search
 * @n: Bit to find
 */
static inline unsigned long fns(unsigned long word, unsigned int n)
{
    while (word) {
        unsigned int bit = __ffs(word);
        if (n-- == 0)
            return bit;
        __clear_bit(bit, &word);
    }

    return BITS_PER_LONG;
}

改成:
需要注意的是,我加入了原先程式碼沒有的 macro (原本只到 __const_hweight8 ),只是為了美觀,如下:

#define __const_hweight2(w)                                                 \
    ((unsigned int) (!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))))

#define __const_hweight4(w)                                                 \
    ((unsigned int) (!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))) + \
                     (!!((w) & (1ULL << 2))) + (!!((w) & (1ULL << 3))))
/* find N'th set bit in a word
 * @word: The word to search
 * @n: Bit to find
 */
static inline unsigned long fns(unsigned long word, unsigned int n)
{
    unsigned int sum = 0;
    unsigned int bits = 0;
    
#if BITS_PER_LONG == 64
    bits = __const_hweight32(word);
    if (bits <= n) {
        n -= bits;
        word >>= 32;
        sum += 32;
    }
#endif

    bits = __const_hweight16(word);
    if (bits <= n) {
        n -= bits;
        word >>= 16;
        sum += 16;
    }
    
    bits = __const_hweight8(word);
    if (bits <= n) {
        n -= bits;
        word >>= 8;
        sum += 8;
    }

    bits = __const_hweight4(word);
    if (bits <= n) {
        n -= bits;
        word >>= 4;
        sum += 4;
    }

    bits = __const_hweight2(word);
    if (bits <= n) {
        n -= bits;
        word >>= 2;
        sum += 2;
    }
    
    bits = (unsigned int) (word & 0x1);
    if (bits <= n) {
        n -= bits;
        word >>= 1;
        sum += 1;
    }
    
    bits = (unsigned int) (word & 0x1);
    if (bits <= n) {
        n -= bits;
        sum += 1;
    }
    
    if(!n)
        return sum;

    return BITS_PER_LONG;
}

函式執行過程解釋

n>0 時,以下 block 為尋找 nset bit ,不過事實上需找 n+1set bit ,因此當 n=0 時,代表他還需要找一個,此時以下 block 則變為尋找全為 0 的 bits ,因此最後會抵達第 n+1set bit 的右一位,但因為 sum 為從 0 開始往上加,因此會剛好等於 n-th set bit 的位置。

bits = __const_hweightC(word & const);
if (bits <= n) {
    n -= bits;
    sum += const;
}
假設需找 7-th set bit
word =
0x1010010101011011
  8 7  6 5 4 32 10
以上為 set bit 的對應 index ,可以看到 7-th set bit 對應位置為 13 (0 起始)
而 __const_hweightC 會回傳範圍 C 有多少個 set bits 

此時 n > 0 
====block 為尋找 n 個 set bit=====
C = 16 回傳 9 > n => 不進入判斷式 
C = 8 回傳 5 <= n => n-=5 sum+=8 
word >>= 8 
0x10100101
  3 2  1 0
C = 4 回傳 2 <= n => n-=2 sum+=4
word >>= 4 
0x1010
  1 0 

此時 n = 0
====block 為尋找全為 0 的 bits=====
C = 2 回傳 1 > n => 不進入判斷式
C = 1 回傳 0 <= n => n-=0 sum+=1
word >>= 1
0x101
  1 0
C = 1 回傳 1 > n => 不進入判斷式
0x101
  1 0

結束 
n=0 sum=13

可以看到此時 wordlsbn-th set bit ,且右移了 13 位,即為 n-th set bitindex

比較:
原式: n+1ffsnclear bit ,而 ffs 使用 log(BITS_PER_LONG) 次判斷式,而 clear bit 為常數操作,因此最後會有 (n+1)*log(BITS_PER_LONG) + (n + 1)(外部 while 迴圈) 次判斷式。
我的版本: log(BITS_PER_LONG) + 1 (再做一次 & 0x1 去完整檢查 64 bit 的範圍) + 1 次判斷式
因此 n 越大,差距應會越明顯

測試

以下為測試程式碼:
可在這裡直接執行 OnlineGDB

測試程式碼
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Assume 64-bit architecture */
#define BITS_PER_LONG 64
#define ARRAY_SIZE 1000000 
#define BITS_PER_BYTE 8

#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define DIV_ROUND_UP(n, d) (((n) + (d) -1) / (d))
#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))

#define __const_hweight8(w)                                              \
    ((unsigned int) ((!!((w) & (1ULL << 0))) + (!!((w) & (1ULL << 1))) + \
                     (!!((w) & (1ULL << 2))) + (!!((w) & (1ULL << 3))) + \
                     (!!((w) & (1ULL << 4))) + (!!((w) & (1ULL << 5))) + \
                     (!!((w) & (1ULL << 6))) + (!!((w) & (1ULL << 7)))))

#define __const_hweight16(w) (__const_hweight8(w) + __const_hweight8((w) >> 8))
#define __const_hweight32(w) \
    (__const_hweight16(w) + __const_hweight16((w) >> 16))
#define __const_hweight64(w) \
    (__const_hweight32(w) + __const_hweight32((w) >> 32))

static inline unsigned long hweight_long(unsigned long w)
{
    return __const_hweight64(w);
}

/* find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static inline unsigned long __ffs(unsigned long word)
{
    int num = 0;
    
#if BITS_PER_LONG == 64
    if (!(word & 0xffffffff)) {
        num += 32;
        word >>= 32;
    }
#endif

    if (!(word & 0xffff)) {
        num += 16;
        word >>= 16;
    }
    if (!(word & 0xff)) {
        num += 8;
        word >>= 8;
    }
    if (!(word & 0xf)) {
        num += 4;
        word >>= 4;
    }
    if (!(word & 0x3)) {
        num += 2;
        word >>= 2;
    }
    if ((word & 0x1) == 0)
        num += 1;
        
    return num;
}

static inline void __clear_bit(unsigned long nr, volatile unsigned long *addr)
{
    unsigned long mask = BIT_MASK(nr);
    unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);

    *p &= ~mask;
}

/* find N'th set bit in a word
 * @word: The word to search
 * @n: Bit to find
 */
static inline unsigned long fns(unsigned long word, unsigned int n)
{
    while (word) {
        unsigned int bit = __ffs(word);
        if (n-- == 0)
            return bit;
        __clear_bit(bit, &word);
    }

    return BITS_PER_LONG;
}

#define __const_hweight2(w)                                                 \
    ((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))))

#define __const_hweight4(w)                                                 \
    ((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))) + \
                     (!!((word) & (1ULL << 2))) + (!!((word) & (1ULL << 3))))
/* find N'th set bit in a word
 * @word: The word to search
 * @n: Bit to find
 */
static inline unsigned long fns2(unsigned long word, unsigned int n)
{
    unsigned long sum = 0;
    unsigned int bits = 0;
    
#if BITS_PER_LONG == 64
    bits = __const_hweight32(word & 0xffffffff);
    if (bits <= n) {
        n -= bits;
        word >>= 32;
        sum += 32;
    }
#endif

    bits = __const_hweight16(word & 0xffff);
    if (bits <= n) {
        n -= bits;
        word >>= 16;
        sum += 16;
    }
    
    bits = __const_hweight8(word & 0xff);
    if (bits <= n) {
        n -= bits;
        word >>= 8;
        sum += 8;
    }

    bits = __const_hweight4(word & 0xf);
    if (bits <= n) {
        n -= bits;
        word >>= 4;
        sum += 4;
    }

    bits = __const_hweight2(word & 0x3);
    if (bits <= n) {
        n -= bits;
        word >>= 2;
        sum += 2;
    }
    
    bits = (unsigned int) (word & 0x1);
    if (bits <= n) {
        n -= bits;
        word >>= 1;
        sum += 1;
    }
    
    bits = (unsigned int) (word & 0x1);
    if (bits <= n) {
        n -= bits;
        sum += 1;
    }
    
    if(!n)
        return sum;

    return BITS_PER_LONG;
}


struct cycletime
{
    unsigned long cycle;
    double time;
};


int main()
{
    unsigned long limit = 2000000000;
    struct cycletime avgrec1 = {.cycle = 0, .time = 0.0};
    struct cycletime avgrec2 = {.cycle = 0, .time = 0.0};
    unsigned long test_t = 200;
    for(unsigned long idx = 0; idx < 64; idx++){
        for(unsigned long a = 0; a < test_t; a++){
            // Allocate memory for the array
            unsigned long *bit_array = malloc(BITS_TO_LONGS(ARRAY_SIZE) * sizeof(unsigned long));
            if (!bit_array) {
                perror("Memory allocation failed");
                return EXIT_FAILURE;
            }
            // Initialize random seed
            srand(a);
            unsigned long t = 0;
            // Set random bits
            for (unsigned long i = 0; i < ARRAY_SIZE; ++i) {
                unsigned long word_idx = BIT_WORD(i);
                unsigned long bit_idx = i & (BITS_PER_LONG - 1);

                if (rand() % 2) {
                    bit_array[word_idx] |= (1UL << bit_idx);
                }
                else
                    bit_array[word_idx] ^= (bit_array[word_idx] & (1UL << bit_idx));
            }
            unsigned long bound = BIT_WORD(ARRAY_SIZE);
            unsigned long r1[bound];
            unsigned long r2[bound];
            clock_t start, end;
            unsigned long long cycles_start, cycles_end;

            start = clock();
            asm volatile("rdtsc" : "=A" (cycles_start));
            for (unsigned long i = 0; i < bound; ++i) {
                r1[i] = fns(bit_array[i], idx);
            }
            asm volatile("rdtsc" : "=A" (cycles_end));
            end = clock();
            avgrec1.time += (double)(end - start) / CLOCKS_PER_SEC;
            if((cycles_end - cycles_start) < limit)
	    	avgrec1.cycle += (cycles_end - cycles_start);
            // printf("original ver.=========================================================================================\n");
            // printf("time : %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
            // printf("CPU cycles : %llu cycles\n", cycles_end - cycles_start);

            start = clock();
            asm volatile("rdtsc" : "=A" (cycles_start));
            for (unsigned long i = 0; i < bound; ++i) {
                r2[i] = fns2(bit_array[i], idx);
            }
            asm volatile("rdtsc" : "=A" (cycles_end));
            end = clock();
            avgrec2.time += (double)(end - start) / CLOCKS_PER_SEC;
	    if((cycles_end - cycles_start) < limit)
            	avgrec2.cycle += (cycles_end - cycles_start);
            // printf("new ver.==============================================================================================\n");
            // printf("time : %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
            // printf("CPU cycles : %llu cycles\n", cycles_end - cycles_start);

            for (unsigned long i = 0; i < bound; ++i) {
                if(r2[i] != r1[i]){
                    printf("Different in %ld, %lu vs %lu\n", i, r1[i], r2[i]);
                }
            }
            free(bit_array);
        }
	// printf("original ver.=========================================================================================\n");
    	// printf("Average time : %f seconds\n", avgrec1.time/(test_t*idx));
    	// printf("Average CPU cycles : %lu cycles\n", avgrec1.cycle/(test_t*idx));
    	// printf("new ver.==============================================================================================\n");
    	// printf("Average time : %f seconds\n", avgrec2.time/(test_t*idx));
    	// printf("Average CPU cycles : %lu cycles\n", avgrec2.cycle/(test_t*idx));
    }
    printf("original ver.=========================================================================================\n");
    printf("Average time : %f seconds\n", avgrec1.time/(test_t*64));
    printf("Average CPU cycles : %lu cycles\n", avgrec1.cycle/(test_t*64));
    printf("new ver.==============================================================================================\n");
    printf("Average time : %f seconds\n", avgrec2.time/(test_t*64));
    printf("Average CPU cycles : %lu cycles\n", avgrec2.cycle/(test_t*64));
    return EXIT_SUCCESS;
}

其中:
test_t 為測試輪數,即尋找 n-th set bit 多少次
idxn-th set bit
ARRAY_SIZE 為一連續記憶體空間,因此每一輪中皆會送入 1000000/64long 長度的 word 去尋找 n-th set bit

測試過程:

  1. 配置長度為 1000000 的記憶體空間並隨機填入 10
  2. 將每一個 long (這裡為64 bits) 長度的記憶體位置傳入 fns 作計算,同時為了公平,兩比較函式的每一輪 (1000000/64) 輸入都相同,並為了更好找到平均值,每一次尋找的 set bits 皆尋找 200 次,而 set bits 為能更全面,範圍為 0~63 。
  3. 共執行 (1000000/64)*64*200
  4. 同時為避免快但錯誤,在得到結果後會將兩者輸出做比較。

結果(每次可能不太相同,但仍有明顯差異,不開啟 O2 選項編譯的情況):

皆為尋找 長度 1000000 的記憶體位置 每一個 word 內的 n-th set bit

測量 32 bit
original ver.============================
Average time : 0.005153 seconds
Average CPU cycles : 18478354 cycles
new ver.=================================
Average time : 0.000790 seconds
Average CPU cycles : 2843377 cycles

測量 64 bit
original ver.============================
Average time : 0.007763 seconds
Average CPU cycles : 27781828 cycles
new ver.=================================
Average time : 0.000729 seconds
Average CPU cycles : 2620535 cycles

最後輸出正確
可以看到我改寫的版本在平均花費時間 (6.52x faster、10.64x faster) 與 cycle 數 (6.5x less、10.60x less) 皆比 linux/include/linux/bitops.h 裡的 fns 還要更佳

且也證明了 n 越大,差距越大,這裡因為是做 0 ~ 310 ~ 63 ,所以會被平均,不過仍可以看到 n 增大會使平均時間增加。
而 CPU cycles 會降低我認為是 branch miss 次數降低,隨著 n 加大,代表需要找的 set bits 更多,在我的版本中會更容易進入判斷式 (只要 bits <= n ,因為 n 加大) ,不過 Linux 核心內的版本就沒那麼幸運,因為他只是要找最低位的 set bit 位置,而與 n 的設定沒有關係,因此 branch prediction 很可能猜錯,透過結果也可以猜出 branch 大致上以預測為進入判斷式為主。

我的程式碼的優勢:

  1. 更快的執行時間,在 n 更大時,會有更大的差距
  2. 使用更少的判斷式
  3. 無需呼叫外部函示

因為並沒有實驗 branch prediction miss 的次數,因此不提到。

使用 perf 做比較

目前可以看到 branch-misses 是不支援的,原因似乎出自 CPU,等待我在其他裝置上嘗試

 Performance counter stats for './fns':

   <not supported>      cycles                                                                
         326980.52 msec cpu-clock                        #    1.000 CPUs utilized             
      326949836000 ns   user_time                        #  999.906 M/sec                     
           7998000 ns   system_time                      #   24.460 K/sec                     
         326981.93 msec task-clock                       #    1.000 CPUs utilized             
      327019524641 ns   duration_time                    #    1.000 G/sec                     
   <not supported>      instructions                   
   <not supported>      branches                       
   <not supported>      branch-misses  

     327.019524641 seconds time elapsed

     326.949836000 seconds user
       0.007998000 seconds sys

使用 $ perf record -e cycles ./fns

Samples: 1M of event 'cpu-clock:uH', Event count (approx.): 393692500000
Overhead  Com  Shared Object          Symbol
  31.54%  fns  fns                    [.] main         
  27.05%  fns  fns                    [.] __ffs       
  10.55%  fns  libc.so.6              [.] __random     
   8.62%  fns  libc.so.6              [.] __random_r   
   3.98%  fns  libc.so.6              [.] __aarch64_swp4_rel                                     
   3.72%  fns  libc.so.6              [.] __aarch64_cas4_acq                                     
   3.52%  fns  fns                    [.] fns2         
   3.21%  fns  fns                    [.] fns         
   2.89%  fns  fns                    [.] rand@plt     
   2.53%  fns  fns                    [.] __clear_bit

可以看到我的版本 fns2 較使用 __ffs__clear_bitfns ,使用更少的 cpu-clock

原圖來看,原始 fns branch prediction miss 的次數佔比非常高
截圖 2024-04-15 下午2.48.15

而我的版本在佔比最高的地方也並非是 branch prediction

截圖 2024-04-15 下午2.53.19

問題:

  1. 我留意到你提供的 online GDB 連結中並沒有添加任何編譯參數,但是 Linux 核心通常會在開啟 O2 選項的情況下編譯,這很可能會大幅影響所測量到的數值,不確定你測試時所採用的編譯參數為何?

起初並未知道此問題,因此僅以 $ gcc -c fns.c -o fns.o 進行編譯
補充實驗
使用以下編譯指令,並使用虛擬機器 ubuntu 22.04 執行

$ gcc -c -O2 fns.c -o fns.o
$ gcc fns.o -o fns
$ ./fns
original ver.==========================================
Average time : 0.002114 seconds
Average time CPU cycles : 7620518 cycles
new ver.===============================================
Average time : 0.000349 seconds
Average time CPU cycles : 1267415 cycles

可以看到效能差異減小至平均花費時間 (6.05x faster) 與 cycle 數 (6.01x less) ,為原始數據約 60% ,不過還是表現較佳。

改以原生 Linux 環境執行

original ver.==========================================
Average time : 0.002162 seconds
Average time CPU cycles : 4301711 cycles
new ver.===============================================
Average time : 0.000343 seconds
Average time CPU cycles : 682152 cycles

可以看到效能差異不大但還是有提升 (6.30x faster) 與 cycle 數 (6.30x less) ,不過 cycle 數大幅減少。

  1. 有些硬體有支援 ffs 操作,但有些硬體沒有支援所以採用軟體實作,但我目前沒看到針對軟體/硬體實作分別的測試結果。

想問是指 ffs 有分為軟體實作,如下

static inline unsigned long __ffs(unsigned long word)
{
    int num = 0;
#if BITS_PER_LONG == 64
    if (!(word & 0xffffffff)) {
        num += 32;
        word >>= 32;
    }
#endif
    ...
}

及硬體實作,如下:

static inline unsigned long __ffs(unsigned long x)
{
	int ret;

	__asm__ ("l.ff1 %0,%1"
		 : "=r" (ret)
		 : "r" (x));

	return ret-1;
}

而硬體實作是以 inline assembly 實作,需要比較我的版本與硬體實作版本的差異嗎?
目前使用虛擬機器及原生 Linux 環境皆會出現以下錯誤

fns.c: Assembler messages:
fns.c:58: Error: no such instruction: `l.ff1 %ecx,%ecx'
fns.c:58: Error: no such instruction: `l.ff1 %ecx,%ecx'

然而我無論如何都查不到 l.ff1 %0,%1 的 assembly instruction。
以下為 __ffs 的 inline,

#ifdef CONFIG_OPENRISC_HAVE_INST_FF1

static inline unsigned long __ffs(unsigned long x)
{
	int ret;

	__asm__ ("l.ff1 %0,%1"
		 : "=r" (ret)
		 : "r" (x));

	return ret-1;
}

#else
#include <asm-generic/bitops/__ffs.h>
#endif

不過就找不到定義 CONFIG_OPENRISC_HAVE_INST_FF1 的地方了

最後結果:

__ffs 為使用 builtin funtion ,那麼結果反而會較差,我後來認為是即使判斷式較少,但因計算 1 的數量的程式碼為不斷地將 1 左移並做 & 運算,因此 cycle 數反而超過了少量判斷式省下來的 cycle 數而提升。因此邱冠維同學提出了更好的方法,經過與 linux 核心開發者及另一位 bitmap 的 reviewer 討論過後,目前可能的程式碼為

static inline unsigned long fns(unsigned long word, unsigned int n)
{
    while (word && n--)
        word &= word - 1;

    return word ? __ffs(word) : BITS_PER_LONG;
}

此方法利用了 Brian Kernighan’s Algorithm ,可以僅做一次 word &= word - 1 就達成原程式碼的 clear bit ,假如一 word 為 0b1000000 ,那麼僅須一次 word &= word - 1 即可以等同清除一位 set bit 而無須先透過 __ffs 計算最低位 set bit 並作清除的動作。但我的方法須使用以下程式碼做計算並得出 一定範圍內的 set bit ,而且會有重複計算相同範圍的問題(假設 32 位內的 set bit 過多,那麼不會進入判斷式,且在下一步會計算 16 位內的 set bit ,可以發現其實是會重複算最後的範圍)

#define __const_hweight8(w)                                                 \
    ((unsigned int) (!!((word) & (1ULL << 0))) + (!!((word) & (1ULL << 1))) + \
                     (!!((word) & (1ULL << 2))) + (!!((word) & (1ULL << 3)))) + \
                     (!!((word) & (1ULL << 4))) + (!!((word) & (1ULL << 5)))) + \
                     (!!((word) & (1ULL << 6))) + (!!((word) & (1ULL << 7)))) 

想當然前者會更為高效,不過當 n 越大時 (且要有足夠的 n ),會有相反的結果,以下為結果,其中 original 為邱同學的程式碼,而 new 則為我的:

Test n: 0
original ver.==========================================
Average time : 0.000021 seconds
Average time CPU cycles : 68181 cycles
new ver.===============================================
Average time : 0.000201 seconds
Average time CPU cycles : 720986 cycles

Test n: 1
original ver.==========================================
Average time : 0.000023 seconds
Average time CPU cycles : 75810 cycles
new ver.===============================================
Average time : 0.000238 seconds
Average time CPU cycles : 855627 cycles

Test n: 2
original ver.==========================================
Average time : 0.000026 seconds
Average time CPU cycles : 87842 cycles
new ver.===============================================
Average time : 0.000254 seconds
Average time CPU cycles : 913202 cycles

Test n: 3
original ver.==========================================
Average time : 0.000031 seconds
Average time CPU cycles : 103377 cycles
new ver.===============================================
Average time : 0.000275 seconds
Average time CPU cycles : 987046 cycles

Test n: 4
original ver.==========================================
Average time : 0.000032 seconds
Average time CPU cycles : 110543 cycles
new ver.===============================================
Average time : 0.000274 seconds
Average time CPU cycles : 983954 cycles

Test n: 5
original ver.==========================================
Average time : 0.000042 seconds
Average time CPU cycles : 144670 cycles
new ver.===============================================
Average time : 0.000280 seconds
Average time CPU cycles : 1004897 cycles

Test n: 6
original ver.==========================================
Average time : 0.000041 seconds
Average time CPU cycles : 141425 cycles
new ver.===============================================
Average time : 0.000295 seconds
Average time CPU cycles : 1061731 cycles

Test n: 7
original ver.==========================================
Average time : 0.000046 seconds
Average time CPU cycles : 161445 cycles
new ver.===============================================
Average time : 0.000308 seconds
Average time CPU cycles : 1107051 cycles

Test n: 8
original ver.==========================================
Average time : 0.000052 seconds
Average time CPU cycles : 179576 cycles
new ver.===============================================
Average time : 0.000313 seconds
Average time CPU cycles : 1121208 cycles

Test n: 9
original ver.==========================================
Average time : 0.000055 seconds
Average time CPU cycles : 193132 cycles
new ver.===============================================
Average time : 0.000315 seconds
Average time CPU cycles : 1130842 cycles

Test n: 10
original ver.==========================================
Average time : 0.000060 seconds
Average time CPU cycles : 210818 cycles
new ver.===============================================
Average time : 0.000324 seconds
Average time CPU cycles : 1164863 cycles

Test n: 11
original ver.==========================================
Average time : 0.000065 seconds
Average time CPU cycles : 228608 cycles
new ver.===============================================
Average time : 0.000336 seconds
Average time CPU cycles : 1206329 cycles

Test n: 12
original ver.==========================================
Average time : 0.000071 seconds
Average time CPU cycles : 251015 cycles
new ver.===============================================
Average time : 0.000350 seconds
Average time CPU cycles : 1257303 cycles

Test n: 13
original ver.==========================================
Average time : 0.000076 seconds
Average time CPU cycles : 266143 cycles
new ver.===============================================
Average time : 0.000353 seconds
Average time CPU cycles : 1267077 cycles

Test n: 14
original ver.==========================================
Average time : 0.000081 seconds
Average time CPU cycles : 284067 cycles
new ver.===============================================
Average time : 0.000362 seconds
Average time CPU cycles : 1298368 cycles

Test n: 15
original ver.==========================================
Average time : 0.000087 seconds
Average time CPU cycles : 306249 cycles
new ver.===============================================
Average time : 0.000376 seconds
Average time CPU cycles : 1351340 cycles

Test n: 16
original ver.==========================================
Average time : 0.000093 seconds
Average time CPU cycles : 328772 cycles
new ver.===============================================
Average time : 0.000378 seconds
Average time CPU cycles : 1359326 cycles

Test n: 17
original ver.==========================================
Average time : 0.000098 seconds
Average time CPU cycles : 345717 cycles
new ver.===============================================
Average time : 0.000370 seconds
Average time CPU cycles : 1329794 cycles

Test n: 18
original ver.==========================================
Average time : 0.000106 seconds
Average time CPU cycles : 373836 cycles
new ver.===============================================
Average time : 0.000380 seconds
Average time CPU cycles : 1363232 cycles

Test n: 19
original ver.==========================================
Average time : 0.000109 seconds
Average time CPU cycles : 385798 cycles
new ver.===============================================
Average time : 0.000382 seconds
Average time CPU cycles : 1372295 cycles

Test n: 20
original ver.==========================================
Average time : 0.000114 seconds
Average time CPU cycles : 404843 cycles
new ver.===============================================
Average time : 0.000382 seconds
Average time CPU cycles : 1371155 cycles

Test n: 21
original ver.==========================================
Average time : 0.000124 seconds
Average time CPU cycles : 440045 cycles
new ver.===============================================
Average time : 0.000397 seconds
Average time CPU cycles : 1426729 cycles

Test n: 22
original ver.==========================================
Average time : 0.000128 seconds
Average time CPU cycles : 453378 cycles
new ver.===============================================
Average time : 0.000395 seconds
Average time CPU cycles : 1419189 cycles

Test n: 23
original ver.==========================================
Average time : 0.000134 seconds
Average time CPU cycles : 475297 cycles
new ver.===============================================
Average time : 0.000416 seconds
Average time CPU cycles : 1493521 cycles

Test n: 24
original ver.==========================================
Average time : 0.000144 seconds
Average time CPU cycles : 509954 cycles
new ver.===============================================
Average time : 0.000421 seconds
Average time CPU cycles : 1511201 cycles

Test n: 25
original ver.==========================================
Average time : 0.000151 seconds
Average time CPU cycles : 536158 cycles
new ver.===============================================
Average time : 0.000419 seconds
Average time CPU cycles : 1491608 cycles

Test n: 26
original ver.==========================================
Average time : 0.000166 seconds
Average time CPU cycles : 591555 cycles
new ver.===============================================
Average time : 0.000429 seconds
Average time CPU cycles : 1539124 cycles

Test n: 27
original ver.==========================================
Average time : 0.000176 seconds
Average time CPU cycles : 627527 cycles
new ver.===============================================
Average time : 0.000424 seconds
Average time CPU cycles : 1523694 cycles

Test n: 28
original ver.==========================================
Average time : 0.000193 seconds
Average time CPU cycles : 687868 cycles
new ver.===============================================
Average time : 0.000417 seconds
Average time CPU cycles : 1496883 cycles

Test n: 29
original ver.==========================================
Average time : 0.000215 seconds
Average time CPU cycles : 768329 cycles
new ver.===============================================
Average time : 0.000410 seconds
Average time CPU cycles : 1473115 cycles

Test n: 30
original ver.==========================================
Average time : 0.000250 seconds
Average time CPU cycles : 891316 cycles
new ver.===============================================
Average time : 0.000407 seconds
Average time CPU cycles : 1460716 cycles

Test n: 31
original ver.==========================================
Average time : 0.000269 seconds
Average time CPU cycles : 963115 cycles
new ver.===============================================
Average time : 0.000387 seconds
Average time CPU cycles : 1389422 cycles

Test n: 32
original ver.==========================================
Average time : 0.000289 seconds
Average time CPU cycles : 1032171 cycles
new ver.===============================================
Average time : 0.000370 seconds
Average time CPU cycles : 1329986 cycles

Test n: 33
original ver.==========================================
Average time : 0.000305 seconds
Average time CPU cycles : 1091329 cycles
new ver.===============================================
Average time : 0.000353 seconds
Average time CPU cycles : 1268201 cycles

Test n: 34
original ver.==========================================
Average time : 0.000325 seconds
Average time CPU cycles : 1163132 cycles
new ver.===============================================
Average time : 0.000338 seconds
Average time CPU cycles : 1213680 cycles

Test n: 35
original ver.==========================================
Average time : 0.000346 seconds
Average time CPU cycles : 1239294 cycles
new ver.===============================================
Average time : 0.000321 seconds
Average time CPU cycles : 1153416 cycles

Test n: 36
original ver.==========================================
Average time : 0.000356 seconds
Average time CPU cycles : 1274277 cycles
new ver.===============================================
Average time : 0.000308 seconds
Average time CPU cycles : 1106007 cycles

Test n: 37
original ver.==========================================
Average time : 0.000362 seconds
Average time CPU cycles : 1298737 cycles
new ver.===============================================
Average time : 0.000295 seconds
Average time CPU cycles : 1060095 cycles

Test n: 38
original ver.==========================================
Average time : 0.000366 seconds
Average time CPU cycles : 1311189 cycles
new ver.===============================================
Average time : 0.000284 seconds
Average time CPU cycles : 1020473 cycles

Test n: 39
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1324439 cycles
new ver.===============================================
Average time : 0.000281 seconds
Average time CPU cycles : 1009790 cycles

Test n: 40
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1340945 cycles
new ver.===============================================
Average time : 0.000293 seconds
Average time CPU cycles : 1050948 cycles

Test n: 41
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1334657 cycles
new ver.===============================================
Average time : 0.000279 seconds
Average time CPU cycles : 1002511 cycles

Test n: 42
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1340435 cycles
new ver.===============================================
Average time : 0.000280 seconds
Average time CPU cycles : 1005596 cycles

Test n: 43
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1324166 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 992143 cycles

Test n: 44
original ver.==========================================
Average time : 0.000369 seconds
Average time CPU cycles : 1322804 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 1002429 cycles

Test n: 45
original ver.==========================================
Average time : 0.000375 seconds
Average time CPU cycles : 1340341 cycles
new ver.===============================================
Average time : 0.000283 seconds
Average time CPU cycles : 1015056 cycles

Test n: 46
original ver.==========================================
Average time : 0.000367 seconds
Average time CPU cycles : 1313704 cycles
new ver.===============================================
Average time : 0.000275 seconds
Average time CPU cycles : 996220 cycles

Test n: 47
original ver.==========================================
Average time : 0.000368 seconds
Average time CPU cycles : 1317041 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 994093 cycles

Test n: 48
original ver.==========================================
Average time : 0.000365 seconds
Average time CPU cycles : 1309510 cycles
new ver.===============================================
Average time : 0.000273 seconds
Average time CPU cycles : 980609 cycles

Test n: 49
original ver.==========================================
Average time : 0.000368 seconds
Average time CPU cycles : 1318358 cycles
new ver.===============================================
Average time : 0.000274 seconds
Average time CPU cycles : 984556 cycles

Test n: 50
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1326410 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 982680 cycles

Test n: 51
original ver.==========================================
Average time : 0.000366 seconds
Average time CPU cycles : 1312605 cycles
new ver.===============================================
Average time : 0.000273 seconds
Average time CPU cycles : 978370 cycles

Test n: 52
original ver.==========================================
Average time : 0.000368 seconds
Average time CPU cycles : 1316405 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 995060 cycles

Test n: 53
original ver.==========================================
Average time : 0.000370 seconds
Average time CPU cycles : 1324702 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 994500 cycles

Test n: 54
original ver.==========================================
Average time : 0.000372 seconds
Average time CPU cycles : 1333373 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 990131 cycles

Test n: 55
original ver.==========================================
Average time : 0.000369 seconds
Average time CPU cycles : 1323664 cycles
new ver.===============================================
Average time : 0.000281 seconds
Average time CPU cycles : 1007539 cycles

Test n: 56
original ver.==========================================
Average time : 0.000375 seconds
Average time CPU cycles : 1340914 cycles
new ver.==============================================
Average time : 0.000278 seconds
Average time CPU cycles : 996436 cycles

Test n: 57
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1338532 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 994950 cycles

Test n: 58
original ver.==========================================
Average time : 0.000376 seconds
Average time CPU cycles : 1344455 cycles
new ver.===============================================
Average time : 0.000278 seconds
Average time CPU cycles : 997226 cycles

Test n: 59
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1334166 cycles
new ver.===============================================
Average time : 0.000276 seconds
Average time CPU cycles : 979184 cycles

Test n: 60
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1336970 cycles
new ver.===============================================
Average time : 0.000280 seconds
Average time CPU cycles : 1006646 cycles

Test n: 61
original ver.==========================================
Average time : 0.000375 seconds
Average time CPU cycles : 1341625 cycles
new ver.===============================================
Average time : 0.000285 seconds
Average time CPU cycles : 1026516 cycles

Test n: 62
original ver.==========================================
Average time : 0.000373 seconds
Average time CPU cycles : 1333731 cycles
new ver.===============================================
Average time : 0.000277 seconds
Average time CPU cycles : 993799 cycles

Test n: 63
original ver.==========================================
Average time : 0.000374 seconds
Average time CPU cycles : 1340000 cycles
new ver.===============================================
Average time : 0.000286 seconds
Average time CPU cycles : 1025193 cycles

經由以上實驗,可以看到在 n 為 35 時,我的程式碼會開始達到較好的效能,不過因為我的程式碼計算過程非常固定,因此可以達到較為常數時間的執行時間,不會有太大幅度的變動,但邱同學的程式碼會與 n 的值成正比,最後會緩和則是因為我是採以隨機填入 1 或 0 的方式, set bits 的期望值應會約為 32 ,因此邱同學的版本花費時間未再上升。
雖然我的程式碼在 n 偏大時表現較佳,不過與版本相比在 n 篇小時會較差是最大的問題。

心得

透過這次 Jserv 老師與邱同學的討論對我有莫大的幫助,包括改進需做嚴謹的實驗是必須的,且參考網路上現有的程式碼是必要的,雖然之前在課堂作業有看過 Brian Kernighan’s Algorithm 的使用,此次改進時卻完全沒想到,但其實一上網搜尋 count set bits 就可以看到這個演算法,我甚至認為少量的判斷式就是最佳的,不過透過結果可以看到這是不成立的。這次我也深知自己能力的不足,學到如何向他人表達你的程式碼之所以更好,並能與他人進行有效率的溝通。

最後謝謝 Jserv 及 邱冠維同學

lookup table 及其他版本

unsigned int lut[16][4] = {{4, 4, 4, 4}, {16, 4, 4, 4}, {17, 4, 4, 4}, {32, 1, 4, 4}, \
                                {18, 4, 4, 4}, {32, 2, 4, 4}, {33, 2, 4, 4}, {48, 1, 2, 4}, \
                                {19, 4, 4, 4}, {32, 3, 4, 4}, {33, 3, 4, 4}, {48, 1, 3, 4}, \
                                {34, 3, 4, 4}, {48, 2, 3, 4}, {49, 2, 3, 4}, {64, 1, 2, 3}};

static inline unsigned int hweight4(uint8_t word)
{
        return lut[word & 0xf][0] >> 4;
}

static inline unsigned long fns4(uint8_t word, unsigned int n)
{
        return lut[word & 0xf][n] & 0xf;
}

static inline unsigned long fns8(uint8_t word, unsigned int n)
{
        unsigned int w = hweight4(word);

        return n < w ? fns4(word, n) : 4 + fns4(word >> 4, n - w);
}						

static inline unsigned long fns16(uint16_t word, unsigned int n)
{
     unsigned int w = __const_hweight8((uint8_t)word);

     return n < w ? fns8((uint8_t)word, n) : 8 + fns8((uint8_t)(word >> 8), n - w);
}

static inline unsigned long fns32(uint32_t word, unsigned int n)
{
     unsigned int w = __const_hweight16((uint16_t)word);

     return n < w ? fns16((uint16_t)word, n) : 16 + fns16((uint16_t)(word >> 16), n - w);
}

static inline unsigned long fns64(uint64_t word, unsigned int n)
{
     unsigned int w = __const_hweight32((uint32_t)word);

     return n < w ? fns32((uint32_t)word, n) : 32 + fns32((uint32_t)(word >> 32), n - w);
}

/**
* fns - find N'th set bit in a word
* @word: The word to search
* @n: Bit to find
* @n: Bit to find, counting from 0
*
* Returns N'th set bit. If no such bit found, returns >= BITS_PER_LONG
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
#if BITS_PER_LONG == 64
     return fns64(word, n);
#else
     return fns32(word, n);
#endif
}

Lookup table 版本1 不使用前面就計算過的版本

satic inline unsigned long fns(unsigned long word, unsigned int n)
{

	
    unsigned int pos = 0;
    unsigned int bits = 0;


#if BITS_PER_LONG == 64
    // find 32 bits
    bits = __const_hweight32(word); 
    if (bits <= n) {
        n -= bits;
        word >>= 32;
        pos += 32;
    }
#endif
	bits = __const_hweight16(word);
    if (bits <= n) {
        n -= bits;
        word >>= 16;
        pos += 16; 
    }

    bits = __const_hweight8(word);
    if (bits <= n) {
        n -= bits;
        word >>= 8;
        pos += 8;
    }

	return n<8? fbs_table[word & 0xff][n] + pos: BITS_PER_LONG+1;
}

Lookup table 版本2 使用前面計算過的版本

static inline unsigned long fns(unsigned long word, unsigned int n)
{	
    unsigned int pos = 0;
    unsigned int bits_higher_higher = 0;
    unsigned int bits_higher = 0;
    unsigned int bits_lower = 0;
    unsigned int bits_lower_lower = 0;
    unsigned int bits = 0;


#if BITS_PER_LONG == 64
    // find 32 bits
    bits_higher_higher = __const_hweight8(word>>24);
    bits_higher = __const_hweight8(word>>16);
    bits_lower = __const_hweight8(word>>8);
    bits_lower_lower = __const_hweight8(word);
    bits = bits_higher_higher+bits_higher+bits_lower+bits_lower_lower;
    
    if (bits <= n) 
        n -= bits;
        word >>= 32;
        pos += 32;
        
        // update 16 bits
        bits_lower = __const_hweight8(word>>8);
		bits_lower_lower = __const_hweight8(word);
    }
#else
    // find 16 bits
    bits_lower = __const_hweigh8(word>>8);
	bits_lower_lower = __const_hweight8(word);
#endif
	bits = bits_lower+bits_lower_lower;
    if (bits <= n) {
        n -= bits;
        word >>= 16;
        pos += 16; 
        
        bits_lower_lower = __const_hweight8(word);
    }

    bits = bits_lower_lower;
    
    if (bits <= n) {
        n -= bits;
        word >>= 8;
        pos += 8;
    }

    return n<8? fbs_table[word & 0xff][n] + pos: BITS_PER_LONG+1;
}

binary search 版本 1 不使用前面就計算過的版本

static inline unsigned long fns(unsigned long word, unsigned int n)
{
    unsigned int sum = 0;
    unsigned int bits = 0;
    
#if BITS_PER_LONG == 64
    bits = __const_hweight32(word);
    if (bits <= n) {
        n -= bits;
        word >>= 32;
        sum += 32;
    }
#endif

    bits = __const_hweight16(word);
    if (bits <= n) {
        n -= bits;
        word >>= 16;
        sum += 16;
    }
    
    bits = __const_hweight8(word);
    if (bits <= n) {
        n -= bits;
        word >>= 8;
        sum += 8;
    }

    bits = __const_hweight4(word);
    if (bits <= n) {
        n -= bits;
        word >>= 4;
        sum += 4;
    }

    bits = __const_hweight2(word);
    if (bits <= n) {
        n -= bits;
        word >>= 2;
        sum += 2;
    }
    
    bits = (unsigned int) (word & 0x1);
    if (bits <= n) {
        n -= bits;
        word >>= 1;
        sum += 1;
    }
    
    bits = (unsigned int) (word & 0x1);
    if (bits <= n) {
        n -= bits;
        sum += 1;
    }
    
    if(!n)
        return sum;

    return BITS_PER_LONG+1;
}

binary search 版本 2 使用前面計算過的版本

static inline unsigned long fns(unsigned long word, unsigned int n)
{
    unsigned int sum = 0;
    unsigned int bits_higher = 0;
    unsigned int bits_lower = 0;
    unsigned int bits = 0;


#if BITS_PER_LONG == 64
    // find 32 bits
    bits_higher = __const_hweight16(word>>16);
    bits_lower = __const_hweight16(word);
    bits = bits_higher+bits_lower;
    
    if (bits <= n) {
        n -= bits;
        word >>= 32;
        sum += 32;
        
        // update 16 bits
        bits_lower = __const_hweight16(word);
    }
#else
    // find 16 bits
    bits_lower = __const_hweight16(word)
#endif

    if (bits_lower <= n) {
        n -= bits_lower;
        word >>= 16;
        sum += 16; 
    }

    // find 8 bits
    bits_higher = __const_hweight4(word>>4);
    bits_lower = __const_hweight4(word);
    bits = bits_higher+bits_lower;
    	
    if (bits <= n) {
        n -= bits;
        word >>= 8;
        sum += 8;
        
        // update 4 bits
        bits_lower = __const_hweight4(word);
    }
    if (bits_lower <= n) {
        n -= bits_lower;
        word >>= 4;
        sum += 4;
    }
    
    // find 2 bits
    bits_higher = (unsigned int) ((word>>1) & 0x1); // 0
    bits_lower = (unsigned int) (word & 0x1); // 1
    bits = bits_higher+bits_lower; // 1
    
    if (bits <= n) {
        n -= bits;
        word >>= 2;
        sum += 2;
        
        // update 1 bit
        bits_lower = (unsigned int) (word & 0x1);
    }
    
    if (bits_lower <= n) {
        n -= bits_lower;
        word >>= 1;
        sum += 1;
        
        // update 1 bit
        bits_lower = (unsigned int) (word & 0x1);
    }
    
    if (bits_lower <= n) {
        n -= bits_lower;
        sum += 1;
    }

    if(!n)
        return sum;

    return BITS_PER_LONG+1;
}