Bit Twiddling Hacks

tags: `Computer Architecture` `Hacks`

在電腦的世界裡，可以做任何數目系統而且複雜的演算，但是大多數的演算都藉由軟體(程式)來解決，而非用硬體(電路)直接進行各種演算，電腦的硬體或其他數位電路在做算術運算時，最基本的電路往往只有二進位加法器而已，至於減法可藉由補數的加法解決，乘法等於連續的加法，除法則是連續的減法，可見加法器在運算數位系統中的重要性，但它也不過是幾個邏輯閘就解決的電路。

Compute the sign of an integer

#include <stdio.h>
#include <limits.h>

int v;
int sign;

int main(){
	
    // sign is either -1 or 0
    sign = -(v < 0);
    // to avoid branching
    sign = -(int)((unsigned int)((int)v) >> (sizeof(int) * CHAR_BIT - 1));
    // less instruction but not portable
    sign = v >> (sizeof(int) * CHAR_BIT - 1);	

    // sign is either -1 or +1
    sign = +1 | (v >> sizeof(int) * CHAR_BIT - 1);

    // sign is either -1, 0, or +1
    sign = (v != 0) | (v >> (sizeof(int) * CHAR_BIT - 1));
    // for portability, brevity, and perhaps speed
    sign = (v > 0) - (v < 0);

    // sign is either 0 or 1
    sign = 1 ^ ((unsigned int)((int)v) >> (sizeof(int) * CHAR_BIT - 1));	
	
    return 0;
}

Detect if two integers have opposite signs

int x, y;               // input values to compare signs

bool f = ((x ^ y) < 0); // true iff x and y have opposite signs

Rule:
^ the left most digit

Compute the integer absolute value (abs) without branching

int v;           // we want to find the absolute value of v
unsigned int r;  // the result goes here 

int const mask = v >> sizeof(int) * CHAR_BIT - 1;
r = (v + mask) ^ mask;

// Patented variation:
r = (v ^ mask) - mask;

Rule:

If v < 0 then mask must be -1.
And (v - 1) ^ -1 must be ~(v - 1).
If v > 0 then mask must be 0.
And v ^ 0 must be v.

Compute the minimum (min) or maximum (max) of two integers without branching

int x;  // we want to find the minimum of x and y
int y;   
int r;  // the result goes here 

r = y ^ ((x ^ y) & -(x < y)); // min(x, y)
r = x ^ ((x ^ y) & -(x < y)); // max(x, y)

// Quick and dirty versions:
// INT_MIN <= x - y <= INT_MAX 
// (x - y) only needs to be evaluated once.
r = y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
r = x - ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // max(x, y)

Rule:

If (x - y) > 0 then ((x - y) >> (sizeof(int) * CHAR_BIT - 1))) must be 0.
And (x - y) & 0 must be 0.
If (x - y) < 0 then ((x - y) >> (sizeof(int) * CHAR_BIT - 1))) must be -1.
And (x - y) & -1 must be (x - y).

Determining if an integer is a power of 2

unsigned int v; // we want to see if v is a power of 2
bool f;         // the result goes here 

f = v && !(v & (v - 1));

e.g.

4 & 3 == 0
3 & 2 != 0

$(4)_{10}$	$(100)_{2}$
$(3)_{10}$	$(011)_{2}$
$(2)_{10}$	$(010)_{2}$

Sign extending from a variable bit-width

Sometimes we need to extend the sign of a number but we don't know a priori the number of bits, b, in which it is represented.

unsigned b; // number of bits representing the number in x
int x;      // sign extend this b-bit number to r
int r;      // resulting sign-extended number
int const m = 1U << (b - 1); // mask can be pre-computed if b is fixed

x = x & ((1U << b) - 1);  // (Skip this if bits in x above position b are already zero.)
r = (x ^ m) - m;

// A slightly faster but less portable method:
int const m = CHAR_BIT * sizeof(x) - b;
r = (x << m) >> m;

Conditionally set or clear bits without branching

bool f;         // conditional flag
unsigned int m; // the bit mask
unsigned int w; // the word to modify:  if (f) w |= m; else w &= ~m; 

w ^= (-f ^ w) & m;

// OR, for superscalar CPUs:
w = (w & ~m) | (-f & m);

Conditionally negate a value without branching

If you need to negate only when a flag is false, then use the following to avoid branching.

bool fDontNegate;  // Flag indicating we should not negate v.
int v;             // Input value to negate if fDontNegate is false.
int r;             // result = fDontNegate ? v : -v;

r = (fDontNegate ^ (fDontNegate - 1)) * v;

If you need to negate only when a flag is true, then use this.

bool fNegate;  // Flag indicating if we should negate v.
int v;         // Input value to negate if fNegate is true.
int r;         // result = fNegate ? -v : v;

r = (v ^ -fNegate) + fNegate;

Merge bits from two values according to a mask

unsigned int a;    // value to merge in non-masked bits
unsigned int b;    // value to merge in masked bits
unsigned int mask; // 1 where bits from b should be selected; 0 where from a.
unsigned int r;    // result of (a & ~mask) | (b & mask) goes here

r = a ^ ((a ^ b) & mask);

Counting bits set (naive way)

unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v

for (c = 0; v; v >>= 1)
{
  c += v & 1;
}

Counting bits set (Brian Kernighan's way)

unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v
for (c = 0; v; c++)
{
  v &= v - 1; // clear the least significant bit set
}

Count bits set (rank) from the most-significant bit upto a given position

The following finds the the rank of a bit, meaning it returns the sum of bits that are set to 1 from the most-signficant bit downto the bit at the given position.

uint64_t v;       // Compute the rank (bits set) in v from the MSB to pos.
unsigned int pos; // Bit position to count bits upto.
uint64_t r;       // Resulting rank of bit at pos goes here.

// Shift out bits after given position.
r = v >> (sizeof(v) * CHAR_BIT - pos);
r = r - ((r >> 1) & ~0UL/3);
r = (r & ~0UL/5) + ((r >> 2) & ~0UL/5);
r = (r + (r >> 4)) & ~0UL/17;
r = (r * (~0UL/255)) >> ((sizeof(v) - 1) * CHAR_BIT);

Select the bit position (from the most-significant bit) with the given count (rank)

The following 64-bit code selects the position of the rth 1 bit when counting from the left. In other words if we start at the most significant bit and proceed to the right, counting the number of bits set to 1 until we reach the desired rank, r, then the position where we stop is returned. If the rank requested exceeds the count of bits set, then 64 is returned. The code may be modified for 32-bit or counting from the right. If branching is fast on your target CPU, consider uncommenting the if-statements and commenting the lines that follow them.

uint64_t v;          // Input value to find position with rank r.
unsigned int r;      // Input: bit's desired rank [1-64].
unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
uint64_t a, b, c, d; // Intermediate temporaries for bit count.
unsigned int t;      // Bit count temporary.

a =  v - ((v >> 1) & ~0UL/3);
b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
c = (b + (b >> 4)) & ~0UL/0x11;
d = (c + (c >> 8)) & ~0UL/0x101;
t = (d >> 32) + (d >> 48);

// Now do branchless select!
s  = 64;
// if (r > t) {s -= 32; r -= t;}
s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
t  = (d >> (s - 16)) & 0xff;
// if (r > t) {s -= 16; r -= t;}
s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
t  = (c >> (s - 8)) & 0xf;
// if (r > t) {s -= 8; r -= t;}
s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
t  = (b >> (s - 4)) & 0x7;
// if (r > t) {s -= 4; r -= t;}
s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
t  = (a >> (s - 2)) & 0x3;
// if (r > t) {s -= 2; r -= t;}
s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
t  = (v >> (s - 1)) & 0x1;
// if (r > t) s--;
s -= ((t - r) & 256) >> 8;
s = 65 - s;

Computing parity the naive way

A parity bit, or check bit, is a bit added to a string of binary code to ensure that the total number of 1-bits in the string is even or odd. Parity bits are used as the simplest form of error detecting code.

unsigned int v;       // word value to compute the parity of
bool parity = false;  // parity will be the parity of v

while (v)
{
  parity = !parity;
  v = v & (v - 1);
}

Swapping values with subtraction and addition

This swaps the values of a and b without using a temporary variable. Don't use this with floating-point numbers (unless you operate on their raw integer representations).

#define SWAP(a, b) ((&(a) == &(b)) || \
                    (((a) -= (b)), ((b) += (a)), ((a) = (b) - (a))))

Swapping values with XOR

This is an old trick to exchange the values of the variables a and b without using extra space for a temporary variable.

#define SWAP(a, b) (((a) == (b)) || (((a) ^= (b)), ((b) ^= (a)), ((a) ^= (b))))

Swapping individual bits with XOR

As an example of swapping ranges of bits suppose we have have b = 00101111 (expressed in binary) and we want to swap the n = 3 consecutive bits starting at i = 1 (the second bit from the right) with the 3 consecutive bits starting at j = 5; the result would be r = 11100011 (binary).

unsigned int i, j; // positions of bit sequences to swap
unsigned int n;    // number of consecutive bits in each sequence
unsigned int b;    // bits to swap reside in b
unsigned int r;    // bit-swapped result goes here

unsigned int x = ((b >> i) ^ (b >> j)) & ((1U << n) - 1); // XOR temporary
r = b ^ ((x << i) | (x << j));

Reverse bits the obvious way

unsigned int v;     // input bits to be reversed
unsigned int r = v; // r will be reversed bits of v; first get LSB of v
int s = sizeof(v) * CHAR_BIT - 1; // extra shift needed at end

for (v >>= 1; v; v >>= 1)
{   
  r <<= 1;
  r |= v & 1;
  s--;
}
r <<= s; // shift when v's highest bits are zero

Compute modulus division by 1 << s without a division operator

const unsigned int n;          // numerator
const unsigned int s;
const unsigned int d = 1U << s; // So d will be one of: 1, 2, 4, 8, 16, 32, ...
unsigned int m;                // m will be n % d
m = n & (d - 1);

Compute modulus division by (1 << s) - 1 without a division operator

unsigned int n;                      // numerator
const unsigned int s;                // s > 0
const unsigned int d = (1 << s) - 1; // so d is either 1, 3, 7, 15, 31, ...).
unsigned int m;                      // n % d goes here.

for (m = n; n > d; n = m)
{
  for (m = 0; n; n >>= s)
  {
    m += n & d;
  }
}
// Now m is a value from 0 to d, but since with modulus division
// we want m to be 0 when it is d.
m = m == d ? 0 : m;

Find the log base 2 of an integer with the MSB N set in O(N) operations (the obvious way)

The log base 2 of an integer is the same as the position of the highest bit set (or most significant bit set, MSB).

unsigned int v; // 32-bit word to find the log base 2 of
unsigned int r = 0; // r will be lg(v)

while (v >>= 1) // unroll for more speed...
{
  r++;
}

Find the log base 2 of an N-bit integer in O(lg(N)) operations

unsigned int v;  // 32-bit value to find the log2 of 
const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
const unsigned int S[] = {1, 2, 4, 8, 16};
int i;

register unsigned int r = 0; // result of log2(v) will go here
for (i = 4; i >= 0; i--) // unroll for speed...
{
  if (v & b[i])
  {
    v >>= S[i];
    r |= S[i];
  } 
}


// OR (IF YOUR CPU BRANCHES SLOWLY):

unsigned int v;	         // 32-bit value to find the log2 of 
register unsigned int r; // result of log2(v) will go here
register unsigned int shift;

r =     (v > 0xFFFF) << 4; v >>= r;
shift = (v > 0xFF  ) << 3; v >>= shift; r |= shift;
shift = (v > 0xF   ) << 2; v >>= shift; r |= shift;
shift = (v > 0x3   ) << 1; v >>= shift; r |= shift;
                                        r |= (v >> 1);

Find the log base 2 of an N-bit integer in O(lg(N)) operations with multiply and lookup

uint32_t v; // find the log base 2 of 32-bit v
int r;      // result goes here

static const int MultiplyDeBruijnBitPosition[32] = 
{
  0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
  8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
};

v |= v >> 1; // first round down to one less than a power of 2 
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;

r = MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];

Find integer log base 10 of an integer

The integer log base 10 is computed by first using one of the techniques above for finding the log base 2. By the relationship log10(v) = log2(v) / log2(10), we need to multiply it by 1/log2(10), which is approximately 1233/4096, or 1233 followed by a right shift of 12. Adding one is needed because the IntegerLogBase2 rounds down. Finally, since the value t is only an approximation that may be off by one, the exact value is found by subtracting the result of v < PowersOf10[t].

unsigned int v; // non-zero 32-bit integer value to compute the log base 10 of 
int r;          // result goes here
int t;          // temporary

static unsigned int const PowersOf10[] = 
    {1, 10, 100, 1000, 10000, 100000,
     1000000, 10000000, 100000000, 1000000000};

t = (IntegerLogBase2(v) + 1) * 1233 >> 12; // (use a lg2 method from above)
r = t - (v < PowersOf10[t]);

Find integer log base 10 of an integer the obvious way

This method works well when the input is uniformly distributed over 32-bit values because 76% of the inputs are caught by the first compare, 21% are caught by the second compare, 2% are caught by the third, and so on (chopping the remaining down by 90% with each comparision). As a result, less than 2.6 operations are needed on average.

unsigned int v; // non-zero 32-bit integer value to compute the log base 10 of 
int r;          // result goes here

r = (v >= 1000000000) ? 9 : (v >= 100000000) ? 8 : (v >= 10000000) ? 7 : 
    (v >= 1000000) ? 6 : (v >= 100000) ? 5 : (v >= 10000) ? 4 : 
    (v >= 1000) ? 3 : (v >= 100) ? 2 : (v >= 10) ? 1 : 0;

Count the consecutive zero bits (trailing) on the right linearly

unsigned int v;  // input to count trailing zero bits
int c;  // output: c will count v's trailing zero bits,
        // so if v is 1101000 (base 2), then c will be 3
if (v)
{
  v = (v ^ (v - 1)) >> 1;  // Set v's trailing 0s to 1s and zero rest
  for (c = 0; v; c++)
  {
    v >>= 1;
  }
}
else
{
  c = CHAR_BIT * sizeof(v);
}

Count the consecutive zero bits (trailing) on the right by binary search

unsigned int v;     // 32-bit word input to count zero bits on right
unsigned int c;     // c will be the number of zero bits on the right,
                    // so if v is 1101000 (base 2), then c will be 3
// NOTE: if 0 == v, then c = 31.
if (v & 0x1) 
{
  // special case for odd v (assumed to happen half of the time)
  c = 0;
}
else
{
  c = 1;
  if ((v & 0xffff) == 0) 
  {  
    v >>= 16;  
    c += 16;
  }
  if ((v & 0xff) == 0) 
  {  
    v >>= 8;  
    c += 8;
  }
  if ((v & 0xf) == 0) 
  {  
    v >>= 4;
    c += 4;
  }
  if ((v & 0x3) == 0) 
  {  
    v >>= 2;
    c += 2;
  }
  c -= v & 0x1;
}

Count the consecutive zero bits (trailing) on the right with modulus division and lookup

It makes use of the fact that the first 32 bit position values are relatively prime with 37, so performing a modulus division with 37 gives a unique number from 0 to 36 for each. These numbers may then be mapped to the number of zeros using a small lookup table.

unsigned int v;  // find the number of trailing zeros in v
int r;           // put the result in r
static const int Mod37BitPosition[] = // map a bit value mod 37 to its position
{
  32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13, 4,
  7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9, 5,
  20, 8, 19, 18
};
r = Mod37BitPosition[(-v & v) % 37];

Count the consecutive zero bits (trailing) on the right with multiply and lookup

Converting bit vectors to indices of set bits is an example use for this. It requires one more operation than the earlier one involving modulus division, but the multiply may be faster. The expression (v & -v) extracts the least significant 1 bit from v. The constant 0x077CB531UL is a de Bruijn sequence, which produces a unique pattern of bits into the high 5 bits for each possible bit position that it is multiplied against. When there are no bits set, it returns 0.

unsigned int v;  // find the number of trailing zeros in 32-bit v 
int r;           // result goes here
static const int MultiplyDeBruijnBitPosition[32] = 
{
  0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 
  31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
r = MultiplyDeBruijnBitPosition[((uint32_t)((v & -v) * 0x077CB531U)) >> 27];

Count the consecutive zero bits (trailing) on the right by casting to a float

The time to convert an integer to a float can be high on some machines. The exponent of the 32-bit IEEE floating point representation is shifted down, and the bias is subtracted to give the position of the least significant 1 bit set in v. If v is zero, then the result is -127.

unsigned int v;            // find the number of trailing zeros in v
int r;                     // the result goes here
float f = (float)(v & -v); // cast the least significant bit in v to a float
r = (*(uint32_t *)&f >> 23) - 0x7f;

e.g.

Scientific notation is

6 = (110)_{2} = (1.1)_{2} \times 2^{(10)_{2}}

IEEE 754 floating-point standard is

- 1^{S} \times (1.0 + 0. M) \times 2^{E - b i a s}

32-bit IEEE floating point representation is

Sign	Exponent	Mantissa
+	2 + 127	1
0	10000001	1000000 00000000 00000000

Round up to the next highest power of 2 by float casting

unsigned int const v; // Round this 32-bit value to the next highest power of 2
unsigned int r;       // Put the result here. (So v=3 -> r=4; v=8 -> r=8)

if (v > 1) 
{
  float f = (float)v;
  unsigned int const t = 1U << ((*(unsigned int *)&f >> 23) - 0x7f);
  r = t << (t < v);
}
else 
{
  r = 1;
}

Round up to the next highest power of 2

The result may be expressed by the formula 1U << (lg(v - 1) + 1).
(It would be faster by 2 operations to use the formula and the log base 2 method that uses a lookup table, but in some situations, lookup tables are not suitable.)

It works by copying the highest set bit to all of the lower bits, and then adding one, which results in carries that set all of the lower bits to 0 and one bit beyond the highest set bit to 1. If the original number was a power of 2, then the decrement will reduce it to one less, so that we round up to the same original value.

unsigned int v; // compute the next highest power of 2 of 32-bit v

int main(){
    v = 0;
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    v += (v == 0);
    return 0;
}

Interleave bits the obvious way

Interleaved bits (aka Morton numbers) are useful for linearizing 2D integer coordinates, so x and y are combined into a single number that can be compared easily and has the property that a number is usually close to another if their x and y values are close.

e.g.
If you want to convert a certain set of integer coordinates to a Morton code, you have to convert the decimal values to binary and interleave the bits of each coordinate:

(x,y) = (5,9) = (0101,1001)
Interleaving the bits results in: 10010011 = 147

unsigned short x;   // Interleave bits of x and y, so that all of the
unsigned short y;   // bits of x are in the even positions and y in the odd;
unsigned int z = 0; // z gets the resulting Morton Number.

for (int i = 0; i < sizeof(x) * CHAR_BIT; i++) // unroll for more speed...
{
  z |= (x & 1U << i) << i | (y & 1U << i) << (i + 1);
}

Interleave bits by Binary Magic Numbers

static const unsigned int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};
static const unsigned int S[] = {1, 2, 4, 8};

unsigned int x; // Interleave lower 16 bits of x and y, so the bits of x
unsigned int y; // are in the even positions and bits from y in the odd;
unsigned int z; // z gets the resulting 32-bit Morton Number.  
                // x and y must initially be less than 65536.

x = (x | (x << S[3])) & B[3];
x = (x | (x << S[2])) & B[2];
x = (x | (x << S[1])) & B[1];
x = (x | (x << S[0])) & B[0];

y = (y | (y << S[3])) & B[3];
y = (y | (y << S[2])) & B[2];
y = (y | (y << S[1])) & B[1];
y = (y | (y << S[0])) & B[0];

z = x | (y << 1);

Compute the lexicographically next bit permutation

Suppose we have a pattern of N bits set to 1 in an integer and we want the next permutation of N 1 bits in a lexicographical sense.

For example, if N is 3 and the bit pattern is 00010011, the next patterns would be 00010101, 00010110, 00011001,00011010, 00011100, 00100011, and so forth. The following is a fast way to compute the next permutation.

unsigned int v; // current permutation of bits 
unsigned int w; // next permutation of bits

unsigned int t = v | (v - 1); // t gets v's least significant 0 bits set to 1
// Next set to 1 the most significant bit to change, 
// set to 0 the least significant ones, and add the necessary 1 bits.
w = (t + 1) | (((~t & -~t) - 1) >> (__builtin_ctz(v) + 1));

The __builtin_ctz(v) GNU C compiler intrinsic for x86 CPUs returns the number of trailing zeros.

unsigned int t = (v | (v - 1)) + 1;  
w = t | ((((t & -t) / (v & -v)) >> 1) - 1);

Determine if a word has a zero byte

// Fewer operations:
unsigned int v; // 32-bit word to check if any 8-bit byte in it is 0
bool hasZeroByte = ~((((v & 0x7F7F7F7F) + 0x7F7F7F7F) | v) | 0x7F7F7F7F);

// More operations:
bool hasNoZeroByte = ((v & 0xff) && (v & 0xff00) && (v & 0xff0000) && (v & 0xff000000))

// OR:
unsigned char * p = (unsigned char *) &v;  
bool hasNoZeroByte = *p && *(p + 1) && *(p + 2) && *(p + 3);

There is yet a faster method — use hasless(v, 1), which is defined below.

#define haszero(v) (((v) - 0x01010101UL) & ~(v) & 0x80808080UL)

Determine if a word has a byte equal to n

We may want to know if any byte in a word has a specific value. To do so, we can XOR the value to test with a word that has been filled with the byte values in which we're interested. Because XORing a value with itself results in a zero byte and nonzero otherwise, we can pass the result to haszero.

#define hasvalue(x,n) \
(haszero((x) ^ (~0UL/255 * (n))))

Determine if a word has a byte less than n

Requirements: x>=0; 0<=n<=128

#define hasless(x,n) (((x)-~0UL/255*(n))&~(x)&~0UL/255*128)

To count the number of bytes in x

#define countless(x,n) \
(((~0UL/255*(127+(n))-((x)&~0UL/255*127))&~(x)&~0UL/255*128)/128%255)

Determine if a word has a byte greater than n

Requirements: x>=0; 0<=n<=127

#define hasmore(x,n) (((x)+~0UL/255*(127-(n))|(x))&~0UL/255*128)