Try   HackMD

2020q3 Homework3 (quiz3)

contributed by < Rsysz >

tags: sysprog

測驗題

1.

根據C99標準的6.5.7節

“The result of E1 >> E2 is E1 right-shifted E2 bit positions. … If E1 has a signed type and a negative value, the resulting value is implementation-defined.

對有號型態的物件進行算術右移(Arithmetic Shift Right)屬於 Unspecified Behavior
因此想實作一套可跨平台的ASR,代表當輸入為負數時右移必須補 1 ,反之則必須補 0

int asr_i(signed int m, unsigned int n)
{
    const int logical = (((int) -1) OP1) > 0;
    unsigned int fixu = -(logical & (OP2));
    int fix = *(int *) &fixu;
    return (m >> n) | (fix ^ (fix >> n));
}

根據題意,已知m為輸入數值,n為右移次數,由 return 回推

  • 假設 m 為正數若 m >> n 為補0,則 fix ^ (fix >> n) = 0
    fix = 0 相當於 fixu = 0 代表 logical 必須等於 0
    因此得知 logical 為測試此平台上負數右移補 0 或 1 , OP1 = (b) >> 1
  • 假設 m 為負數若 m >> n 為補0,則 fix ^ (fix >> n) 必須補上對應 n 位的 1
    logical 為 1 代表右移補 0 , fix = -OP2m < 0fix 必須等於 -1
    因此 OP2 = (c) m < 0






ASR



start

start



init

input value = m, shift value = n



start->init





shift

(((int) -1) >> 1) > 0



init->shift





m

m < 0



init->m





fix

fix = fixu = -(logical & (m < 0))



shift->fix:w





m_true

m > 0, logical = 0 or 1
  m < 0, logical = 0
  fix = 0



fix->m_true





m_false

m < 0, logical = 1
  fix = -1



fix->m_false





m->fix:e





return

 (m >> n) | (fix ^ (fix >> n)) 



m_true->return





m_false->return





end

end



return->end





5arith1為例, m = -5 , n = 1 若假設右移為 邏輯右移







SR



struct1

1

1

1

1

1

1

1

1



struct2

0

1

1

1

1

1

1

1



struct1:f0->struct2:f1





struct1:f1->struct2:f2





struct1:f2->struct2:f3





struct1:f3->struct2:f4





struct1:f4->struct2:f5





struct1:f5->struct2:f6





struct1:f6->struct2:f7





int
((int) -1)



logica

logical = 1



struct2:e->logica:w





int_s
((int) -1) >> 1



m 為負數, fix = fixu = -1







SR



m

1

1

1

1

1

0

1

1



n

0

1

1

1

1

1

0

1



m:f0->n:f1





m:f1->n:f2





m:f2->n:f3





m:f3->n:f4





m:f4->n:f5





m:f5->n:f6





m:f6->n:f7





m_text_1
m(-5)



result

1

1

1

1

1

1

0

1



n->result





m_text_2
m >> n(-5 >> 1)



result_text
(m >> n) | (fix ^ (fix >> 1)) (-3)



struct1

1

1

1

1

1

1

1

1



struct2

0

1

1

1

1

1

1

1



struct1:f0->struct2:f1





struct1:f1->struct2:f2





struct1:f2->struct2:f3





struct1:f3->struct2:f4





struct1:f4->struct2:f5





struct1:f5->struct2:f6





struct1:f6->struct2:f7





fix
fix



struct3

1

0

0

0

0

0

0

0



struct2:f0->struct3:f0





struct2:f1->struct3:f1





struct2:f2->struct3:f2





struct2:f3->struct3:f3





struct2:f4->struct3:f4





struct2:f5->struct3:f5





struct2:f6->struct3:f6





struct2:f7->struct3:f7





fix_s
fix >> 1



fix_xor
fix ^ (fix >> 1)



2.

bool isPowerOfFour(int num)
{
    return num > 0 && (num & (num - 1))==0 &&
           !(__builtin_ctzl(num) OPQ);  
}

4n(n)0,因此 num > 0
4n22nnum2n4n
, 因此 (num & (num - 1)) == 0

Built-in Function: int __builtin_ctz (unsigned int x)

  • Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined.

Built-in Function: int __builtin_clz (unsigned int x)

  • Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.
4n
Binary
40
0000 0001
41
0000 0100
42
0001 0000

已知

4n=0000....01002n0
!(__builtin_ctz(num) OPQ) 確保尾端數來皆是偶數個 0 排除 2 的次方數
因此 OPQ = (f) & 0x1

延伸問題

bool isPowerOfFour(int num)
{   
     return (num & 0x55555555) && (!(num & (num - 1)));
}

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →

透過 0x55555555 確保 num 的二進位只有奇數位有值,又 4 的次方數轉為二進位只有一個 1 於是再排除有複數位的情況。

int bitwiseComplement(int N){
    if(N)
        return ~N & ((1 << (32 - __builtin_clz(N))) - 1);
    return 1;        
}

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →

找出 MSB 後有效位並對 1 求補數,利用 1 << (32 - __builtin_clz(N)) - 1 製造有效位 Mask 並求剩下的數。

#define SWAP(a, b) ((a) ^= (b), (b) ^= (a), (a) ^= (b)) 

int firstMissingPositive(int* nums, int numsSize){
    for(int i = 0; i < numsSize; i++){
        if(nums[i] > 0 && nums[i] < numsSize && nums[i] != nums[nums[i] - 1]){
            int j = nums[i] - 1;
            SWAP(nums[i], nums[j]);
            i--;                                            
        }
    }
    for(int i = 0; i < numsSize; i++){
        if(nums[i] != i + 1)
            return i + 1;        
    }
        return numsSize + 1;
}

參考SwappingValuesXOR

3.

針對 LeetCode 1342. Number of Steps to Reduce a Number to Zero,我們可運用 gcc 內建函式實作,假設 int 寬度為 32 位元,程式碼列表如下:

int numberOfSteps (int num)
{
    return num ? AAA + 31 - BBB : 0;
}

__builtin_popcount(x): 計算 x 的二進位表示中,總共有幾個 1

參考題二得知對應的__builtin_clz(num)為計算 MSB 開始到第一個 1 之間 0 的個數

透過32 - __builtin_clz(num)求得 MSB 與 0 的距離,再以 __builtin_popcount(x) - 1 求得 MSB 以外的 1 與 0 的距離,因此 AAA = __builtin_popcount(num),BBB = __builtin_clz(num)

延伸問題

int numberOfSteps (int num)
{
    return __builtin_popcount(num) + 31 - __builtin_clz(num | 1);
}


__builtin_clz(num | 1) 針對 LSB 為 0 的情況加 1 ,而避免了傳入 0

int popcount(int num){
    /*__builtin_popcount(num)*/
    num = (num & 0x55555555) + ((num >> 1) & 0x55555555);
    num = (num & 0x33333333) + ((num >> 2) & 0x33333333);
    num = (num & 0x0F0F0F0F) + ((num >> 4) & 0x0F0F0F0F);
    num = (num & 0x00FF00FF) + ((num >> 8) & 0x00FF00FF);
    num = (num & 0x0000FFFF) + ((num >> 16) & 0x0000FFFF);
    return num;
}

int clz(int v){
    /*31 - __builtin_clz(num | 1)*/
    unsigned int r;
    unsigned int shift;
    r =     (v > 0xFFFF) << 4; v >>= r;
    shift = (v > 0xFF  ) << 3; v >>= shift; r |= shift;
    shift = (v > 0xF   ) << 2; v >>= shift; r |= shift;
    shift = (v > 0x3   ) << 1; v >>= shift; r |= shift;
                                            r |= (v >> 1);
    return r;
}

int numberOfSteps (int num)
{
    return popcount(num) + clz(num | 1);
}

byte 切割輸入值最終加總為 popcount ,而下方 clz 則可以計算 __builtin_clz 的相反數值

參考Find the log base 2 of an N-bit integer in O(lg(N)) operations

4.

考慮以下 64-bit GCD (greatest common divisor, 最大公因數) 求值函式:

#include <stdint.h>
uint64_t gcd64(uint64_t u, uint64_t v) {
    if (!u || !v) return u | v;
    while (v) {                               
        uint64_t t = v;
        v = u % v;
        u = t;
    }
    return u;
}

改寫為以下等價實作:

#include <stdint.h>
uint64_t gcd64(uint64_t u, uint64_t v) {
    if (!u || !v) return u | v;
    int shift;
    for (shift = 0; !((u | v) & 1); shift++) {
        u /= 2, v /= 2;
    }
    while (!(u & 1))
        u /= 2;
    do {
        while (!(v & 1))
            v /= 2;
        if (u < v) {
            v -= u;
        } else {
            uint64_t t = u - v;
            u = v;
            v = t;
        }
    } while (XXX);
    return YYY;
}

上述程式就是在實作輾轉相除法,首先透過 shift 判斷當 uv 都是偶數時,不斷除以 2 直到一方變成奇數,再對 u 單獨除到變為奇數,接著以不斷將兩數中較小的數置於 u 不斷以 v -= uv 取餘,因此最終XXX = (b) v YYY = (e) u << shift

以上 GCD 加速方法可以參考Binary GCD利用

binary gcd 的精神就是當兩數為偶數時,必有一

2 因子。
gcd(x,y)=2·gcd(x2,y2)

且一數為奇數另一數為偶數,則無
2
因子。
gcd(x,y)=gcd(x,y2)

於是我們可以改良為:
even_factor=min(ctz(x),ctz(y))

gcd(x,y)=even_factor·gcd((xeven_factor),(yeven_factor))

透過 shift 加速 GCD 運算,最終於 return u << shift 補回先前右移的數值

延伸問題

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

uint64_t gcd64(uint64_t u, uint64_t v) {
  unsigned t = u | v;
  if (!u || !v)
      return t;
  while (u) {
    u >>= __builtin_ctz(u);
    v >>= __builtin_ctz(v);
    if (u >= v)
      u = (u - v) / 2;
    else
      v = (v - u) / 2;
  }
  return (v << __builtin_ctz(t));
}

透過 __builtin_ctz 改寫除 2 部分加速整體運算速度,然而透過 perf stat 指令查看兩者在處理極大的值時消耗的時間相差甚少,因此於 main 新增

for (int i = 0; i < 0xFFF; i+=2) {
	for (int j = 0; j < 0xFFF; j+=2)
		gcd64(i, j);
}

以大量偶數數據帶入運算針對 __builtin_ctz 與題目所提供的 shift 方法比較計算速度

 Performance counter stats for './ex_builtin_ctz':

            218.58 msec task-clock                #    0.999 CPUs utilized
                 1      context-switches          #    0.005 K/sec
                 0      cpu-migrations            #    0.000 K/sec
                49      page-faults               #    0.224 K/sec
       9,0579,4689      cycles                    #    4.144 GHz
       6,9513,1948      instructions              #    0.77  insn per cycle
       1,2077,3811      branches                  #  552.541 M/sec
         1246,1088      branch-misses             #   10.32% of all branches

       0.218806228 seconds time elapsed

       0.218822000 seconds user
       0.000000000 seconds sys
 Performance counter stats for './ex_original':

            415.57 msec task-clock                #    0.999 CPUs utilized
                 1      context-switches          #    0.002 K/sec
                 0      cpu-migrations            #    0.000 K/sec
                48      page-faults               #    0.116 K/sec
      17,2396,2751      cycles                    #    4.148 GHz
      11,1716,9482      instructions              #    0.65  insn per cycle
       2,8125,5819      branches                  #  676.788 M/sec
         3811,1653      branch-misses             #   13.55% of all branches

       0.415865646 seconds time elapsed

       0.415875000 seconds user
       0.000000000 seconds sys

才能得到較為顯著的測試結果,採用 __builtin_ctz 明顯減少了 CPU cycles , fetch instructionsbranch 的次數,得到效能的提升。這裡值得注意的是如果使用 printf將得到的數值印出,將會消耗大量的時間,因此強烈不推薦使用。

5.

在影像處理中,bit array (也稱 bitset) 廣泛使用,考慮以下程式碼:

#include <stddef.h>
size_t naive(uint64_t *bitmap, size_t bitmapsize, uint32_t *out)
{
    size_t pos = 0;
    for (size_t k = 0; k < bitmapsize; ++k) {
        uint64_t bitset = bitmap[k];
        size_t p = k * 64;
        for (int i = 0; i < 64; i++) {
            if ((bitset >> i) & 0x1)
                out[pos++] = p + i;
        }
    }
    return pos;
}

可用 clz 改寫為效率更高且等價的程式碼:

#include <stddef.h>
size_t improved(uint64_t *bitmap, size_t bitmapsize, uint32_t *out)
{
    size_t pos = 0;
    uint64_t bitset;
    for (size_t k = 0; k < bitmapsize; ++k) {
        bitset = bitmap[k];
        while (bitset != 0) {
            uint64_t t = KKK;
            int r = __builtin_ctzll(bitset);
            out[pos++] = k * 64 + r;
            bitset ^= t;                           
        }
    }
    return pos;
}

上述程式碼相互對應後 int r = __builtin_ctzll 取代 if ((bitset >> i) & 0x1) 的功能,但改寫後的程式碼對 out[pos++] 賦值後還必須 clear bitset 的 LSB 以確保下個 Loop 的正確運行

根據題三提到

n=∼n+1

利用 bitset & -bitset 留下 LSB 再於 bitset ^= t 清除 LSB
因此 KKK = (e) bitset & -bitset