# 2022q1 Homework2 (quiz2) contributed by < yaohwang99 > > [測驗題](https://hackmd.io/@sysprog/linux2022-quiz2) ## Problem1 1. Calulate ${\lfloor}a\div2{\rfloor}$ and ${\lfloor}b\div2{\rfloor}$, if $a$ and $b$ are both odd number, then we need to add $1$ to the result. ==EXP1: a & b & 1== c #include <stdint.h> uint32_t average(uint32_t a, uint32_t b) { return (a >> 1) + (b >> 1) + (EXP1); }  2. Retain what $a$ and $b$ has in common and divide the rest with $2$. ==EXP2: a & b== ==EXP3: a ^ b== c uint32_t average(uint32_t a, uint32_t b) { return (EXP2) + ((EXP3) >> 1); }  ## Problem 2 Here we can look at some properties: + x ^ x is equivalent to 0 + x ^ 0 is equivalent to x + x ^ (x ^ y) is equivalent to (x ^ x) ^ y and is equivalent to y + x & ~0 is equivalent to x, x & 0 is equivalent to 0 We would like to return a when a > b and return b otherwise. 1. ((EXP4) & -(EXP5)) is 0 when a <= b, therefore, assume ==EXP5 is a > b== 2. If EXP5 is a > b , then when a > b, the return value a ^ ((EXP4) & 1) equals b, therefore, ==EXP4 is a ^ b== c #include <stdint.h> uint32_t max(uint32_t a, uint32_t b) { return a ^ ((EXP4) & -(EXP5)); }  ## Problem 3 c= #include <stdint.h> uint64_t gcd64(uint64_t u, uint64_t v) { if (!u || !v) return u | v; int shift; for (shift = 0; !((u | v) & 1); shift++) { u /= 2, v /= 2; } while (!(u & 1)) u /= 2; do { while (!(v & 1)) v /= 2; if (u < v) { v -= u; } else { uint64_t t = u - v; u = v; v = t; } } while (COND); return RET; }  1. If both x and y are 0, $gcd(0,0)=0$. 2. $gce(x,0)=x$ and $gcd(0,y)=y$ because everything divides 0. c=4 if (!u || !v) return u | v;  3. If x and y are both even, $gcd(x,y)=2*gcd(x/2,y/2)$ because 2 is a common divisor. c=5 int shift; for (shift = 0; !((u | v) & 1); shift++) { u /= 2, v /= 2; }  4. If x is even and y is odd, $gcd(x,y)=gcd(x/2,y)$, and vice versa. c=9 while (!(u & 1)) u /= 2; do { while (!(v & 1)) v /= 2;  5. Implement Euclidean algorithm 1. When v > u, v -= u, and vice versa. 2. Return RET when v == u ==COND: v== ==RET: u << shift== c=13 if (u < v) { v -= u; } else { uint64_t t = u - v; u = v; v = t; } } while (COND); return RET;  ## Problem 4 out stores the index of each 1 in the bit map, for example, if we use 3 unsigned 8 bit integer as an array, then out == {0, 6, 9, 12, 18, 19, 23} and pos is 7 graphviz graph { node[shape = record] node1[label = "{10000010|01001000|00110001}"] }  [Bit Hack #7](https://catonmat.net/low-level-bit-hacks) shows how to isolate the rightmost 1-bit. ==EXP6: -x | x== c #include <stddef.h> size_t improved(uint64_t *bitmap, size_t bitmapsize, uint32_t *out) { size_t pos = 0; uint64_t bitset; for (size_t k = 0; k < bitmapsize; ++k) { bitset = bitmap[k]; while (bitset != 0) { uint64_t t = EXP6; int r = __builtin_ctzll(bitset); out[pos++] = k * 64 + r; bitset ^= t; } } return pos; }  ## Problem 5 Consider the example where + n = 197 + d = 84 + remainder = 29 + division = 2 1. Insert the division graphviz digraph { rankdir=LR p[shape = plaintext] decimal[shape = plaintext] q[shape = plaintext] result[shape = plaintext] node1[shape = record, label = "{<h>2|<dot>.|<n>}"] result->node1:h p->node1:n node2[shape = record, label = "{<h>0|0|0|0|...|0}"] decimal->node2:h q->node2:h }  2. Insert decimal until repeated remainder appear. ==MMM: list_add== ==EEE: &heads[remainder % size]== graphviz digraph { rankdir=LR p[shape = plaintext] decimal[shape = plaintext] q[shape = plaintext] result[shape = plaintext] node1[shape = record, label = "{<h>2|<dot>.|<n>}"] result->node1:h p->node1:n node2[shape = record, label = "{<h>3|4|5|2|3|8|0|9|5|<n>0|0|...|0}"] decimal->node2:h q->node2:n[color = red] }  3. Now we know the decimal is 345238095, the question is where to put the parenthesis. Find the index of the repeated value. In this case, it is 3. We insert ( before the target and insert ) at the end of the string. ==PPP: pos--== graphviz digraph { rankdir=LR p[shape = plaintext] decimal[shape = plaintext] q[shape = plaintext] result[shape = plaintext] node1[shape = record, label = "{<h>2|<dot>.|<n>3|4|5|(|2|3|8|0|9|5|)|<e>\\0}"] result->node1:h p->node1:e[color = red] node2[shape = record, label = "{<h>3|4|5|2|3|8|0|9|5|<n>0|0|...|0}"] decimal->node2:h q->node2:n }  ### Improvement The following code is an implementation without list. Note that: 1. Because the reamainder is always smaller than divisor, we can use divisor as hash table size. However, heap buffer overflow will occur when divisor is too big because there is no sufficient memory. 2. We must use 64 bits to store the dividend because when remainder is big enough, remainder * 10 will need more than 32 bits to store. c unsigned find(unsigned r, unsigned pos, unsigned *heads){ if (!heads[r]) { heads[r] = pos; return 0; } else { return heads[r]; } } char * fractionToDecimal(int numerator, int denominator){ char *ret = malloc(10000); long long n = numerator, d = denominator; unsigned r = 0; bool is_pos = true; char *p = ret; if(numerator < 0){ n = -n; is_pos = !is_pos; } if(denominator < 0){ d = -d; is_pos = !is_pos; } if(!is_pos && n){ *p++ = '-'; } sprintf(p, "%lld", n / d); r = n%d; n = r * 10; if(!n) return ret; p = ret + strlen(ret); *p++ = '.'; char *dec = malloc(10000); char *q = dec; unsigned *heads = malloc(d * sizeof(unsigned)); memset(heads, 0, d * sizeof(unsigned)); unsigned idx = 0; unsigned pos = 1; while(n){ idx = find(r, pos, heads); if(idx){ break; } r = n % d; *q++ = n/d + '0'; ++pos; n = r * 10; } *q = '\0'; for(pos = 1, q = dec; *q; ++pos){ if(pos == idx){ *p++ = '('; } *p++ = *q++; } if(idx) *p++ = ')'; *p = '\0'; free(dec); free(heads); return ret; }  ## Problem 6 [data alignment](https://hackmd.io/@sysprog/c-memory?type=view#data-alignment[]) If we declare a structure like this c struct s1 { char c; int a; }  Alignment will be done by default. The address of a will be 4 bytes after the address of c. Implement the characteristic into macro, we can get the result by subtracting the address of the two pointer. ==M:_h== ==X: 0== c /* * ALIGNOF - get the alignment of a type * @t: the type to test * * This returns a safe alignment for the given type. */ #define ALIGNOF(t) \ ((char *)(&((struct { char c; t _h; } *)0)->M) - (char *)X)  graphviz digraph { rankdir = LR node1[shape = record, label = "<0>0x0||...||<4>_h|||..."] c[shape = plaintext, label = "&c"] h[shape = plaintext, label = "&((&c)->_h)"] c->node1:0 h->node1:4 }  ## Problem 7 From [Faster Remainder by Direct Computation](https://arxiv.org/abs/1902.01961), + $Proposition \ 1: Given \ d\in[1,2^N), \ and \ non-negative \\ integers \ c, \ L \ such \ that \ 2^{N+L}\leq c*d\leq 2^{N+L}+2^L, \\ then \ given \ some \ n\in[0,2^N), \ we \ have \ that$ ==$d \ divides \ n \ if \ and \ only \ if \ (c*n) \mod2^{N+L}<c$== + $c=\lceil\frac{2^{N+L}}{d}\rceil$ In our case, $N=L=32,\ 2^{64}\leq\lceil\frac{2^{64}}{d}\rceil*d\leq\frac{2^{64}}{d}+d\leq 2^{64}+2^{32}$ for $d\in[1,2^{32})$ $c=\lceil\frac{2^{64}}{d}\rceil=\lfloor\frac{2^{64}}{d}+\frac{d-1}{d}\rfloor=\lfloor\frac{2^{64}-1}{d}\rfloor+1$ c static uint64_t M3 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 3 + 1; static uint64_t M5 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 5 + 1;  n * M will be implicitly converted to uint32_t, which is the 32 less significant bit, which equals to $(c*n)\mod2^{32}$. Therefore, n * M <= M - 1 is the same as $(c*n) \mod2^{N+L}<c$ from proposition 1. c static inline bool is_divisible(uint32_t n, uint64_t M) { return n * M <= M - 1; }  :::info M3 can also be interpreted as the decimal part of $1/3$ shifted left 64 bits. Which will be 01010101...~2~ M5 can be interpreted as the decimal part of $1/5$ shifted left 64 bits. Which will be 001100110011...~2~ The 64 least significant bits is the decimal part of the quotient. ::: |number|start|length| |-|-|-| 3|0|4| 5|4|4| 15|0|8| none|8|2| From the above tabel, we can easily deduct that ==KK1: div3==, ==KK2: div5==. For the starting position, we can see that when the number divides 3, the starting position is always zero, so we need to shift more then 1 to clear every bit. Therefore, ==KK3: div3 << 2==. :::danger Note that in line 17, the default starting index should be 8, not 9. ::: c= static inline bool is_divisible(uint32_t n, uint64_t M) { return n * M <= M - 1; } static uint64_t M3 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 3 + 1; static uint64_t M5 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 5 + 1; int main(int argc, char **argv) { for (size_t i = 1; i <= 100; i++) { uint8_t div3 = is_divisible(i, M3); uint8_t div5 = is_divisible(i, M5); unsigned int length = (2 << KK1) << KK2; char fmt[9]; strncpy(fmt, &"FizzBuzz%u"[(8 >> div5) >> (KK3)], length); fmt[length] = '\0'; printf(fmt, i); printf("\n"); } return 0; }  To avoid printf, I modified the code and compare result using perf. :::spoiler code c #include<stdio.h> #include<stdbool.h> #include<stdint.h> #include<string.h> #include<unistd.h> static uint64_t M3 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 3 + 1; static uint64_t M5 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 5 + 1; int main(int argc, char **argv) { for (uint32_t i = 1; i <= 100; i++) { uint8_t div3 = i * M3 <= M3 - 1; uint8_t div5 = i * M5 <= M5 - 1; unsigned int length = (2 << div3) << div5; char fmt[9],out[9]; strncpy(fmt, &"FizzBuzz%u"[(8 >> div5) >> (div3 << 2)], length); fmt[length] = '\0'; } return 0; }  c #include <stdio.h> #include<unistd.h> #include<string.h> int main() { for (unsigned int i = 1; i <= 100; i++) { char fmt[9]; if (i % 3 == 0){ strncpy(fmt,"Fizz",4); fmt[4]='\0'; } if (i % 5 == 0){ strncpy(fmt,"Buzz",4); fmt[4]='\0'; } if (i % 15 == 0){ strncpy(fmt,"FizzBuzz",8); fmt[8]='\0'; } if ((i % 3) && (i % 5)) { strncpy(fmt,"%u", 2); fmt[2]='\0'; } } return 0; }  :::  Performance counter stats for './bitwise' (100 runs): 0.23 msec task-clock # 0.572 CPUs utilized ( +- 0.88% ) 0 context-switches # 42.831 /sec ( +-100.00% ) 0 cpu-migrations # 0.000 /sec 47 page-faults # 203.148 K/sec ( +- 0.27% ) 717,390 cycles # 3.073 GHz ( +- 0.88% ) 659,489 instructions # 0.92 insn per cycle ( +- 0.20% ) 128,091 branches # 548.630 M/sec ( +- 0.19% ) 3,806 branch-misses # 2.97% of all branches ( +- 1.34% ) 0.00040795 +- 0.00000406 seconds time elapsed ( +- 1.00% )   Performance counter stats for './naive' (100 runs): 0.20 msec task-clock # 0.568 CPUs utilized ( +- 0.65% ) 0 context-switches # 0.000 /sec 0 cpu-migrations # 0.000 /sec 46 page-faults # 228.637 K/sec ( +- 0.20% ) 682,379 cycles # 3.370 GHz ( +- 0.65% ) 649,553 instructions # 0.95 insn per cycle ( +- 0.19% ) 125,643 branches # 620.580 M/sec ( +- 0.19% ) 3,553 branch-misses # 2.83% of all branches ( +- 1.45% ) 0.00035664 +- 0.00000320 seconds time elapsed ( +- 0.90% )  Notice that the naive version is not slower. Look at the assembly code, if(i % 3) is:  movl -4(%rbp), %ecx movl %ecx, %edx movl $2863311531, %eax imulq %rdx, %rax shrq$32, %rax shrl %eax movl %eax, %edx addl %edx, %edx addl %eax, %edx movl %ecx, %eax subl %edx, %eax testl %eax, %eax jne .L3  2863311531 is 0xAAAAAAB, which is $\lceil2^{33}/3\rceil$, which means that compiler converts division to multiplication just like we did. Furthermore, the naive version does not call strncpy but the bitwise version does strncpy(fmt,"FizzBuzz",8)is converted to  leaq -13(%rbp), %rax movabsq $8825495348169042246, %rsi movq %rsi, (%rax)  strncpy(fmt, &"FizzBuzz%u"[(8 >> div5) >> (div3 << 2)], length)is converted to  movl -12(%rbp), %edx movzbl -6(%rbp), %eax movl$8, %esi movl %eax, %ecx sarl %cl, %esi movzbl -5(%rbp), %eax sall \$2, %eax movl %eax, %ecx sarl %cl, %esi movl %esi, %eax cltq leaq .LC0(%rax), %rcx leaq -21(%rbp), %rax movq %rcx, %rsi movq %rax, %rdi call strncpy