Computer Architecture
Hacks
在電腦的世界裡,可以做任何數目系統而且複雜的演算,但是大多數的演算都藉由軟體(程式)來解決,而非用硬體(電路)直接進行各種演算,電腦的硬體或其他數位電路在做算術運算時,最基本的電路往往只有二進位加法器而已,至於減法可藉由補數的加法解決,乘法等於連續的加法,除法則是連續的減法,可見加法器在運算數位系統中的重要性 ,但它也不過是幾個邏輯閘就解決的電路。
#include <stdio.h>
#include <limits.h>
int v;
int sign;
int main(){
// sign is either -1 or 0
sign = -(v < 0);
// to avoid branching
sign = -(int)((unsigned int)((int)v) >> (sizeof(int) * CHAR_BIT - 1));
// less instruction but not portable
sign = v >> (sizeof(int) * CHAR_BIT - 1);
// sign is either -1 or +1
sign = +1 | (v >> sizeof(int) * CHAR_BIT - 1);
// sign is either -1, 0, or +1
sign = (v != 0) | (v >> (sizeof(int) * CHAR_BIT - 1));
// for portability, brevity, and perhaps speed
sign = (v > 0) - (v < 0);
// sign is either 0 or 1
sign = 1 ^ ((unsigned int)((int)v) >> (sizeof(int) * CHAR_BIT - 1));
return 0;
}
int x, y; // input values to compare signs
bool f = ((x ^ y) < 0); // true iff x and y have opposite signs
Rule:
^
the left most digit
int v; // we want to find the absolute value of v
unsigned int r; // the result goes here
int const mask = v >> sizeof(int) * CHAR_BIT - 1;
r = (v + mask) ^ mask;
// Patented variation:
r = (v ^ mask) - mask;
Rule:
int x; // we want to find the minimum of x and y
int y;
int r; // the result goes here
r = y ^ ((x ^ y) & -(x < y)); // min(x, y)
r = x ^ ((x ^ y) & -(x < y)); // max(x, y)
// Quick and dirty versions:
// INT_MIN <= x - y <= INT_MAX
// (x - y) only needs to be evaluated once.
r = y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
r = x - ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // max(x, y)
Rule:
unsigned int v; // we want to see if v is a power of 2
bool f; // the result goes here
f = v && !(v & (v - 1));
e.g.
4 & 3 == 0
3 & 2 != 0
Sometimes we need to extend the sign of a number but we don't know a priori the number of bits, b, in which it is represented.
unsigned b; // number of bits representing the number in x
int x; // sign extend this b-bit number to r
int r; // resulting sign-extended number
int const m = 1U << (b - 1); // mask can be pre-computed if b is fixed
x = x & ((1U << b) - 1); // (Skip this if bits in x above position b are already zero.)
r = (x ^ m) - m;
// A slightly faster but less portable method:
int const m = CHAR_BIT * sizeof(x) - b;
r = (x << m) >> m;
bool f; // conditional flag
unsigned int m; // the bit mask
unsigned int w; // the word to modify: if (f) w |= m; else w &= ~m;
w ^= (-f ^ w) & m;
// OR, for superscalar CPUs:
w = (w & ~m) | (-f & m);
If you need to negate only when a flag is false, then use the following to avoid branching.
bool fDontNegate; // Flag indicating we should not negate v.
int v; // Input value to negate if fDontNegate is false.
int r; // result = fDontNegate ? v : -v;
r = (fDontNegate ^ (fDontNegate - 1)) * v;
If you need to negate only when a flag is true, then use this.
bool fNegate; // Flag indicating if we should negate v.
int v; // Input value to negate if fNegate is true.
int r; // result = fNegate ? -v : v;
r = (v ^ -fNegate) + fNegate;
unsigned int a; // value to merge in non-masked bits
unsigned int b; // value to merge in masked bits
unsigned int mask; // 1 where bits from b should be selected; 0 where from a.
unsigned int r; // result of (a & ~mask) | (b & mask) goes here
r = a ^ ((a ^ b) & mask);
unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v
for (c = 0; v; v >>= 1)
{
c += v & 1;
}
unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v
for (c = 0; v; c++)
{
v &= v - 1; // clear the least significant bit set
}
The following finds the the rank of a bit, meaning it returns the sum of bits that are set to 1 from the most-signficant bit downto the bit at the given position.
uint64_t v; // Compute the rank (bits set) in v from the MSB to pos.
unsigned int pos; // Bit position to count bits upto.
uint64_t r; // Resulting rank of bit at pos goes here.
// Shift out bits after given position.
r = v >> (sizeof(v) * CHAR_BIT - pos);
r = r - ((r >> 1) & ~0UL/3);
r = (r & ~0UL/5) + ((r >> 2) & ~0UL/5);
r = (r + (r >> 4)) & ~0UL/17;
r = (r * (~0UL/255)) >> ((sizeof(v) - 1) * CHAR_BIT);
The following 64-bit code selects the position of the rth 1 bit when counting from the left. In other words if we start at the most significant bit and proceed to the right, counting the number of bits set to 1 until we reach the desired rank, r, then the position where we stop is returned. If the rank requested exceeds the count of bits set, then 64 is returned. The code may be modified for 32-bit or counting from the right. If branching is fast on your target CPU, consider uncommenting the if-statements and commenting the lines that follow them.
uint64_t v; // Input value to find position with rank r.
unsigned int r; // Input: bit's desired rank [1-64].
unsigned int s; // Output: Resulting position of bit with rank r [1-64]
uint64_t a, b, c, d; // Intermediate temporaries for bit count.
unsigned int t; // Bit count temporary.
a = v - ((v >> 1) & ~0UL/3);
b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
c = (b + (b >> 4)) & ~0UL/0x11;
d = (c + (c >> 8)) & ~0UL/0x101;
t = (d >> 32) + (d >> 48);
// Now do branchless select!
s = 64;
// if (r > t) {s -= 32; r -= t;}
s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
t = (d >> (s - 16)) & 0xff;
// if (r > t) {s -= 16; r -= t;}
s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
t = (c >> (s - 8)) & 0xf;
// if (r > t) {s -= 8; r -= t;}
s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
t = (b >> (s - 4)) & 0x7;
// if (r > t) {s -= 4; r -= t;}
s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
t = (a >> (s - 2)) & 0x3;
// if (r > t) {s -= 2; r -= t;}
s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
t = (v >> (s - 1)) & 0x1;
// if (r > t) s--;
s -= ((t - r) & 256) >> 8;
s = 65 - s;
A parity bit, or check bit, is a bit added to a string of binary code to ensure that the total number of 1-bits in the string is even or odd. Parity bits are used as the simplest form of error detecting code.
unsigned int v; // word value to compute the parity of
bool parity = false; // parity will be the parity of v
while (v)
{
parity = !parity;
v = v & (v - 1);
}
This swaps the values of a and b without using a temporary variable. Don't use this with floating-point numbers (unless you operate on their raw integer representations).
#define SWAP(a, b) ((&(a) == &(b)) || \
(((a) -= (b)), ((b) += (a)), ((a) = (b) - (a))))
This is an old trick to exchange the values of the variables a and b without using extra space for a temporary variable.
#define SWAP(a, b) (((a) == (b)) || (((a) ^= (b)), ((b) ^= (a)), ((a) ^= (b))))
As an example of swapping ranges of bits suppose we have have b = 00101111 (expressed in binary) and we want to swap the n = 3 consecutive bits starting at i = 1 (the second bit from the right) with the 3 consecutive bits starting at j = 5; the result would be r = 11100011 (binary).
unsigned int i, j; // positions of bit sequences to swap
unsigned int n; // number of consecutive bits in each sequence
unsigned int b; // bits to swap reside in b
unsigned int r; // bit-swapped result goes here
unsigned int x = ((b >> i) ^ (b >> j)) & ((1U << n) - 1); // XOR temporary
r = b ^ ((x << i) | (x << j));
unsigned int v; // input bits to be reversed
unsigned int r = v; // r will be reversed bits of v; first get LSB of v
int s = sizeof(v) * CHAR_BIT - 1; // extra shift needed at end
for (v >>= 1; v; v >>= 1)
{
r <<= 1;
r |= v & 1;
s--;
}
r <<= s; // shift when v's highest bits are zero
const unsigned int n; // numerator
const unsigned int s;
const unsigned int d = 1U << s; // So d will be one of: 1, 2, 4, 8, 16, 32, ...
unsigned int m; // m will be n % d
m = n & (d - 1);
unsigned int n; // numerator
const unsigned int s; // s > 0
const unsigned int d = (1 << s) - 1; // so d is either 1, 3, 7, 15, 31, ...).
unsigned int m; // n % d goes here.
for (m = n; n > d; n = m)
{
for (m = 0; n; n >>= s)
{
m += n & d;
}
}
// Now m is a value from 0 to d, but since with modulus division
// we want m to be 0 when it is d.
m = m == d ? 0 : m;
The log base 2 of an integer is the same as the position of the highest bit set (or most significant bit set, MSB).
unsigned int v; // 32-bit word to find the log base 2 of
unsigned int r = 0; // r will be lg(v)
while (v >>= 1) // unroll for more speed...
{
r++;
}
unsigned int v; // 32-bit value to find the log2 of
const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
const unsigned int S[] = {1, 2, 4, 8, 16};
int i;
register unsigned int r = 0; // result of log2(v) will go here
for (i = 4; i >= 0; i--) // unroll for speed...
{
if (v & b[i])
{
v >>= S[i];
r |= S[i];
}
}
// OR (IF YOUR CPU BRANCHES SLOWLY):
unsigned int v; // 32-bit value to find the log2 of
register unsigned int r; // result of log2(v) will go here
register unsigned int shift;
r = (v > 0xFFFF) << 4; v >>= r;
shift = (v > 0xFF ) << 3; v >>= shift; r |= shift;
shift = (v > 0xF ) << 2; v >>= shift; r |= shift;
shift = (v > 0x3 ) << 1; v >>= shift; r |= shift;
r |= (v >> 1);
uint32_t v; // find the log base 2 of 32-bit v
int r; // result goes here
static const int MultiplyDeBruijnBitPosition[32] =
{
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
};
v |= v >> 1; // first round down to one less than a power of 2
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
r = MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
The integer log base 10 is computed by first using one of the techniques above for finding the log base 2. By the relationship log10(v) = log2(v) / log2(10), we need to multiply it by 1/log2(10), which is approximately 1233/4096, or 1233 followed by a right shift of 12. Adding one is needed because the IntegerLogBase2 rounds down. Finally, since the value t is only an approximation that may be off by one, the exact value is found by subtracting the result of v < PowersOf10[t].
unsigned int v; // non-zero 32-bit integer value to compute the log base 10 of
int r; // result goes here
int t; // temporary
static unsigned int const PowersOf10[] =
{1, 10, 100, 1000, 10000, 100000,
1000000, 10000000, 100000000, 1000000000};
t = (IntegerLogBase2(v) + 1) * 1233 >> 12; // (use a lg2 method from above)
r = t - (v < PowersOf10[t]);
This method works well when the input is uniformly distributed over 32-bit values because 76% of the inputs are caught by the first compare, 21% are caught by the second compare, 2% are caught by the third, and so on (chopping the remaining down by 90% with each comparision). As a result, less than 2.6 operations are needed on average.
unsigned int v; // non-zero 32-bit integer value to compute the log base 10 of
int r; // result goes here
r = (v >= 1000000000) ? 9 : (v >= 100000000) ? 8 : (v >= 10000000) ? 7 :
(v >= 1000000) ? 6 : (v >= 100000) ? 5 : (v >= 10000) ? 4 :
(v >= 1000) ? 3 : (v >= 100) ? 2 : (v >= 10) ? 1 : 0;
unsigned int v; // input to count trailing zero bits
int c; // output: c will count v's trailing zero bits,
// so if v is 1101000 (base 2), then c will be 3
if (v)
{
v = (v ^ (v - 1)) >> 1; // Set v's trailing 0s to 1s and zero rest
for (c = 0; v; c++)
{
v >>= 1;
}
}
else
{
c = CHAR_BIT * sizeof(v);
}
unsigned int v; // 32-bit word input to count zero bits on right
unsigned int c; // c will be the number of zero bits on the right,
// so if v is 1101000 (base 2), then c will be 3
// NOTE: if 0 == v, then c = 31.
if (v & 0x1)
{
// special case for odd v (assumed to happen half of the time)
c = 0;
}
else
{
c = 1;
if ((v & 0xffff) == 0)
{
v >>= 16;
c += 16;
}
if ((v & 0xff) == 0)
{
v >>= 8;
c += 8;
}
if ((v & 0xf) == 0)
{
v >>= 4;
c += 4;
}
if ((v & 0x3) == 0)
{
v >>= 2;
c += 2;
}
c -= v & 0x1;
}
It makes use of the fact that the first 32 bit position values are relatively prime with 37, so performing a modulus division with 37 gives a unique number from 0 to 36 for each. These numbers may then be mapped to the number of zeros using a small lookup table.
unsigned int v; // find the number of trailing zeros in v
int r; // put the result in r
static const int Mod37BitPosition[] = // map a bit value mod 37 to its position
{
32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13, 4,
7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9, 5,
20, 8, 19, 18
};
r = Mod37BitPosition[(-v & v) % 37];
Converting bit vectors to indices of set bits is an example use for this. It requires one more operation than the earlier one involving modulus division, but the multiply may be faster. The expression (v & -v) extracts the least significant 1 bit from v. The constant 0x077CB531UL is a de Bruijn sequence, which produces a unique pattern of bits into the high 5 bits for each possible bit position that it is multiplied against. When there are no bits set, it returns 0.
unsigned int v; // find the number of trailing zeros in 32-bit v
int r; // result goes here
static const int MultiplyDeBruijnBitPosition[32] =
{
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
r = MultiplyDeBruijnBitPosition[((uint32_t)((v & -v) * 0x077CB531U)) >> 27];
The time to convert an integer to a float can be high on some machines. The exponent of the 32-bit IEEE floating point representation is shifted down, and the bias is subtracted to give the position of the least significant 1 bit set in v. If v is zero, then the result is -127.
unsigned int v; // find the number of trailing zeros in v
int r; // the result goes here
float f = (float)(v & -v); // cast the least significant bit in v to a float
r = (*(uint32_t *)&f >> 23) - 0x7f;
e.g.
Scientific notation is
IEEE 754 floating-point standard is
32-bit IEEE floating point representation is
Sign | Exponent | Mantissa |
---|---|---|
+ | 2 + 127 | 1 |
0 | 10000001 | 1000000 00000000 00000000 |
unsigned int const v; // Round this 32-bit value to the next highest power of 2
unsigned int r; // Put the result here. (So v=3 -> r=4; v=8 -> r=8)
if (v > 1)
{
float f = (float)v;
unsigned int const t = 1U << ((*(unsigned int *)&f >> 23) - 0x7f);
r = t << (t < v);
}
else
{
r = 1;
}
The result may be expressed by the formula 1U << (lg(v - 1) + 1).
(It would be faster by 2 operations to use the formula and the log base 2 method that uses a lookup table, but in some situations, lookup tables are not suitable.)
It works by copying the highest set bit to all of the lower bits, and then adding one, which results in carries that set all of the lower bits to 0 and one bit beyond the highest set bit to 1. If the original number was a power of 2, then the decrement will reduce it to one less, so that we round up to the same original value.
unsigned int v; // compute the next highest power of 2 of 32-bit v
int main(){
v = 0;
v--;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
v++;
v += (v == 0);
return 0;
}
Interleaved bits (aka Morton numbers) are useful for linearizing 2D integer coordinates, so x and y are combined into a single number that can be compared easily and has the property that a number is usually close to another if their x and y values are close.
e.g.
If you want to convert a certain set of integer coordinates to a Morton code, you have to convert the decimal values to binary and interleave the bits of each coordinate:
(x,y) = (5,9) = (0101,1001)
Interleaving the bits results in: 10010011 = 147
unsigned short x; // Interleave bits of x and y, so that all of the
unsigned short y; // bits of x are in the even positions and y in the odd;
unsigned int z = 0; // z gets the resulting Morton Number.
for (int i = 0; i < sizeof(x) * CHAR_BIT; i++) // unroll for more speed...
{
z |= (x & 1U << i) << i | (y & 1U << i) << (i + 1);
}
static const unsigned int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};
static const unsigned int S[] = {1, 2, 4, 8};
unsigned int x; // Interleave lower 16 bits of x and y, so the bits of x
unsigned int y; // are in the even positions and bits from y in the odd;
unsigned int z; // z gets the resulting 32-bit Morton Number.
// x and y must initially be less than 65536.
x = (x | (x << S[3])) & B[3];
x = (x | (x << S[2])) & B[2];
x = (x | (x << S[1])) & B[1];
x = (x | (x << S[0])) & B[0];
y = (y | (y << S[3])) & B[3];
y = (y | (y << S[2])) & B[2];
y = (y | (y << S[1])) & B[1];
y = (y | (y << S[0])) & B[0];
z = x | (y << 1);
Suppose we have a pattern of N bits set to 1 in an integer and we want the next permutation of N 1 bits in a lexicographical sense.
For example, if N is 3 and the bit pattern is 00010011, the next patterns would be 00010101, 00010110, 00011001,00011010, 00011100, 00100011, and so forth. The following is a fast way to compute the next permutation.
unsigned int v; // current permutation of bits
unsigned int w; // next permutation of bits
unsigned int t = v | (v - 1); // t gets v's least significant 0 bits set to 1
// Next set to 1 the most significant bit to change,
// set to 0 the least significant ones, and add the necessary 1 bits.
w = (t + 1) | (((~t & -~t) - 1) >> (__builtin_ctz(v) + 1));
The __builtin_ctz(v)
GNU C compiler intrinsic for x86 CPUs returns the number of trailing zeros.
unsigned int t = (v | (v - 1)) + 1;
w = t | ((((t & -t) / (v & -v)) >> 1) - 1);
// Fewer operations:
unsigned int v; // 32-bit word to check if any 8-bit byte in it is 0
bool hasZeroByte = ~((((v & 0x7F7F7F7F) + 0x7F7F7F7F) | v) | 0x7F7F7F7F);
// More operations:
bool hasNoZeroByte = ((v & 0xff) && (v & 0xff00) && (v & 0xff0000) && (v & 0xff000000))
// OR:
unsigned char * p = (unsigned char *) &v;
bool hasNoZeroByte = *p && *(p + 1) && *(p + 2) && *(p + 3);
There is yet a faster method — use hasless(v, 1), which is defined below.
#define haszero(v) (((v) - 0x01010101UL) & ~(v) & 0x80808080UL)
We may want to know if any byte in a word has a specific value. To do so, we can XOR the value to test with a word that has been filled with the byte values in which we're interested. Because XORing a value with itself results in a zero byte and nonzero otherwise, we can pass the result to haszero.
#define hasvalue(x,n) \
(haszero((x) ^ (~0UL/255 * (n))))
Requirements: x>=0; 0<=n<=128
#define hasless(x,n) (((x)-~0UL/255*(n))&~(x)&~0UL/255*128)
To count the number of bytes in x
#define countless(x,n) \
(((~0UL/255*(127+(n))-((x)&~0UL/255*127))&~(x)&~0UL/255*128)/128%255)
Requirements: x>=0; 0<=n<=127
#define hasmore(x,n) (((x)+~0UL/255*(127-(n))|(x))&~0UL/255*128)
To count the number of bytes in x
#define countmore(x,n) \
(((((x)&~0UL/255*127)+~0UL/255*(127-(n))|(x))&~0UL/255*128)/128%255)
When m < n, this technique tests if a word x contains an unsigned byte value, such that m < value < n.
Requirements: x>=0; 0<=m<=127; 0<=n<=128
#define hasbetween(x,m,n) \
((~0UL/255*(127+(n))-((x)&~0UL/255*127)&~(x)&((x)&~0UL/255*127)+~0UL/255*(127-(m)))&~0UL/255*128)
To count the number of bytes in x that are between m and n (exclusive)
#define countbetween(x,m,n) (hasbetween(x,m,n)/128%255)