2022q1 Homework2 (quiz2)

contributed by < yaohwang99 >

測驗題

Problem`1`

Calulate
$⌊ a \div 2 ⌋$ and
$⌊ b \div 2 ⌋$ , if
$a$ and
$b$ are both odd number, then we need to add
$1$ to the result.
EXP1: a & b & 1

#include <stdint.h>
uint32_t average(uint32_t a, uint32_t b)
{
    return (a >> 1) + (b >> 1) + (EXP1);
}

Retain what
$a$ and
$b$ has in common and divide the rest with
$2$ .
EXP2: a & b
EXP3: a ^ b

uint32_t average(uint32_t a, uint32_t b)
{
    return (EXP2) + ((EXP3) >> 1);
}

Problem `2`

Here we can look at some properties:

x ^ x is equivalent to 0
x ^ 0 is equivalent to x
x ^ (x ^ y) is equivalent to (x ^ x) ^ y and is equivalent to y
x & ~0 is equivalent to x, x & 0 is equivalent to 0

We would like to return a when a > b and return b otherwise.

((EXP4) & -(EXP5)) is 0 when a <= b, therefore, assume EXP5 is a > b
If EXP5 is a > b , then when a > b, the return value a ^ ((EXP4) & 1) equals b, therefore, EXP4 is a ^ b

#include <stdint.h>
uint32_t max(uint32_t a, uint32_t b)
{
    return a ^ ((EXP4) & -(EXP5));
}

Problem `3`























#include <stdint.h>
uint64_t gcd64(uint64_t u, uint64_t v)
{
    if (!u || !v) return u | v;
    int shift;
    for (shift = 0; !((u | v) & 1); shift++) {
        u /= 2, v /= 2;
    }
    while (!(u & 1))
        u /= 2;
    do {
        while (!(v & 1))
            v /= 2;
        if (u < v) {
            v -= u;
        } else {
            uint64_t t = u - v;
            u = v;
            v = t;
        }
    } while (COND);
    return RET;
}

If both x and y are 0,
$g c d (0, 0) = 0$ .
$g c e (x, 0) = x$ and
$g c d (0, y) = y$ because everything divides 0.


    if (!u || !v) return u | v;

If x and y are both even,
$g c d (x, y) = 2 * g c d (x / 2, y / 2)$ because 2 is a common divisor.




    int shift;
    for (shift = 0; !((u | v) & 1); shift++) {
        u /= 2, v /= 2;
    }

If x is even and y is odd,
$g c d (x, y) = g c d (x / 2, y)$ , and vice versa.





while (!(u & 1))
        u /= 2;
    do {
        while (!(v & 1))
            v /= 2;

Implement Euclidean algorithm
1. When v > u, v -= u, and vice versa.
2. Return RET when v == u

COND: v
RET: u << shift









        if (u < v) {
            v -= u;
        } else {
            uint64_t t = u - v;
            u = v;
            v = t;
        }
    } while (COND);
    return RET;

Problem `4`

out stores the index of each 1 in the bit map, for example, if we use 3 unsigned 8 bit integer as an array, then out == {0, 6, 9, 12, 18, 19, 23} and pos is 7

Bit Hack #7 shows how to isolate the rightmost 1-bit.
EXP6: -x | x

#include <stddef.h>
size_t improved(uint64_t *bitmap, size_t bitmapsize, uint32_t *out)
{
    size_t pos = 0;
    uint64_t bitset;
    for (size_t k = 0; k < bitmapsize; ++k) {
        bitset = bitmap[k];
        while (bitset != 0) {
            uint64_t t = EXP6;
            int r = __builtin_ctzll(bitset);
            out[pos++] = k * 64 + r;
            bitset ^= t;                           
        }
    }
    return pos;
}

Problem `5`

Consider the example where

n = 197
d = 84
remainder = 29
division = 2

Insert the division

Insert decimal until repeated remainder appear.
MMM: list_add
EEE: &heads[remainder % size]

Now we know the decimal is 345238095, the question is where to put the parenthesis.
Find the index of the repeated value. In this case, it is 3. We insert ( before the target and insert ) at the end of the string.
PPP: pos--

Improvement

The following code is an implementation without list.
Note that:

Because the reamainder is always smaller than divisor, we can use divisor as hash table size. However, heap buffer overflow will occur when divisor is too big because there is no sufficient memory.
We must use 64 bits to store the dividend because when remainder is big enough, remainder * 10 will need more than 32 bits to store.

unsigned find(unsigned r, unsigned pos, unsigned *heads){
    if (!heads[r]) {
        heads[r] = pos;
        return 0;
    } else {
        return heads[r];
    }
}

char * fractionToDecimal(int numerator, int denominator){
    char *ret = malloc(10000);
    
    long long n = numerator, d = denominator;
    unsigned r = 0;
    bool is_pos = true;
    char *p = ret;
    if(numerator < 0){
        n = -n;
        is_pos = !is_pos;
    }
        
    if(denominator < 0){
        d = -d;
        is_pos = !is_pos;
    }
    if(!is_pos && n){
        *p++ = '-';
    }
    
    sprintf(p, "%lld", n / d);
    r = n%d;
    n = r * 10;
    if(!n)
        return ret;
    p = ret + strlen(ret);
    *p++ = '.';
    
    char *dec = malloc(10000);
    char *q = dec;
    unsigned *heads = malloc(d * sizeof(unsigned));
    memset(heads, 0, d * sizeof(unsigned));
    
    unsigned idx = 0;
    unsigned pos = 1;
    
    while(n){
        idx = find(r, pos, heads);
        if(idx){
            break;
        }
        r = n % d;
        *q++ = n/d + '0';
        ++pos;
        n = r * 10;
    }
    *q = '\0';

    for(pos = 1, q = dec; *q; ++pos){
        if(pos == idx){
            *p++ = '(';
        }
        *p++ = *q++;
    }
    if(idx)
        *p++ = ')';
    *p = '\0';
    free(dec);
    free(heads);
    return ret;
}

Problem `6`

data alignment
If we declare a structure like this

struct s1 {
    char c;
    int a;
}

Alignment will be done by default. The address of a will be 4 bytes after the address of c.

Implement the characteristic into macro, we can get the result by subtracting the address of the two pointer.
M:_h
X: 0

/*
 * ALIGNOF - get the alignment of a type
 * @t: the type to test
 *
 * This returns a safe alignment for the given type.
 */
#define ALIGNOF(t) \
    ((char *)(&((struct { char c; t _h; } *)0)->M) - (char *)X)

Problem `7`

From Faster Remainder by Direct Computation,

$P r o p o s i t i o n 1 : G i v e n d \in [1, 2^{N}), a n d n o n - n e g a t i v e i n t e g e r s c, L s u c h t h a t 2^{N + L} \leq c * d \leq 2^{N + L} + 2^{L}, t h e n g i v e n s o m e n \in [0, 2^{N}), w e h a v e t h a t$
$d d i v i d e s n i f a n d o n l y i f (c * n) \mod 2^{N + L} < c$
$c = ⌈ \frac{2^{N + L}}{d} ⌉$

In our case,

N = L = 32, 2^{64} \leq ⌈ \frac{2^{64}}{d} ⌉ * d \leq \frac{2^{64}}{d} + d \leq 2^{64} + 2^{32}

for

d \in [1, 2^{32})

c = ⌈ \frac{2^{64}}{d} ⌉ = ⌊ \frac{2^{64}}{d} + \frac{d - 1}{d} ⌋ = ⌊ \frac{2^{64} - 1}{d} ⌋ + 1

static uint64_t M3 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 3 + 1;
static uint64_t M5 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 5 + 1;

n * M will be implicitly converted to uint32_t, which is the 32 less significant bit, which equals to

(c * n) \mod 2^{32}

.
Therefore, n * M <= M - 1 is the same as

(c * n) \mod 2^{N + L} < c

from proposition 1.

static inline bool is_divisible(uint32_t n, uint64_t M)
{
    return n * M <= M - 1;
}

M3 can also be interpreted as the decimal part of

1 / 3

shifted left 64 bits.
Which will be 01010101…₂
M5 can be interpreted as the decimal part of

1 / 5

shifted left 64 bits.
Which will be 001100110011…₂
The 64 least significant bits is the decimal part of the quotient.

number	start	length
3	0	4
5	4	4
15	0	8
none	8	2

From the above tabel, we can easily deduct that KK1: div3, KK2: div5.
For the starting position, we can see that when the number divides 3, the starting position is always zero, so we need to shift more then 1 to clear every bit. Therefore, KK3: div3 << 2.

Note that in line 17, the default starting index should be 8, not 9.
























static inline bool is_divisible(uint32_t n, uint64_t M)
{
    return n * M <= M - 1;
}

static uint64_t M3 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 3 + 1;
static uint64_t M5 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 5 + 1;

int main(int argc, char **argv)
{
    for (size_t i = 1; i <= 100; i++) {
        uint8_t div3 = is_divisible(i, M3);
        uint8_t div5 = is_divisible(i, M5);
        unsigned int length = (2 << KK1) << KK2;

        char fmt[9];
        strncpy(fmt, &"FizzBuzz%u"[(8 >> div5) >> (KK3)], length);
        fmt[length] = '\0';

        printf(fmt, i);
        printf("\n");
    }
    return 0;
}

To avoid printf, I modified the code and compare result using perf.

code

#include<stdio.h>
#include<stdbool.h>
#include<stdint.h>
#include<string.h>
#include<unistd.h>

static uint64_t M3 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 3 + 1;
static uint64_t M5 = UINT64_C(0xFFFFFFFFFFFFFFFF) / 5 + 1;

int main(int argc, char **argv)
{
    for (uint32_t i = 1; i <= 100; i++) {
        uint8_t div3 = i * M3 <= M3 - 1;
        uint8_t div5 = i * M5 <= M5 - 1;
        unsigned int length = (2 << div3) << div5;

        char fmt[9],out[9];
        strncpy(fmt, &"FizzBuzz%u"[(8 >> div5) >> (div3 << 2)], length);
        fmt[length] = '\0';
    }
    return 0;
}

#include <stdio.h>
#include<unistd.h>
#include<string.h>
int main() {
    
    for (unsigned int i = 1; i <= 100; i++) {
        char fmt[9];
        if (i % 3 == 0){
            strncpy(fmt,"Fizz",4);
            fmt[4]='\0';
        } 
        if (i % 5 == 0){
            strncpy(fmt,"Buzz",4);
            fmt[4]='\0';
        } 
        if (i % 15 == 0){
            strncpy(fmt,"FizzBuzz",8);
            fmt[8]='\0';
        } 
        if ((i % 3) && (i % 5)) {
            strncpy(fmt,"%u", 2);
            fmt[2]='\0';
        }
    }
    return 0;
}

 Performance counter stats for './bitwise' (100 runs):

              0.23 msec task-clock                #    0.572 CPUs utilized            ( +-  0.88% )
                 0      context-switches          #   42.831 /sec                     ( +-100.00% )
                 0      cpu-migrations            #    0.000 /sec                   
                47      page-faults               #  203.148 K/sec                    ( +-  0.27% )
           717,390      cycles                    #    3.073 GHz                      ( +-  0.88% )
           659,489      instructions              #    0.92  insn per cycle           ( +-  0.20% )
           128,091      branches                  #  548.630 M/sec                    ( +-  0.19% )
             3,806      branch-misses             #    2.97% of all branches          ( +-  1.34% )

        0.00040795 +- 0.00000406 seconds time elapsed  ( +-  1.00% )

 Performance counter stats for './naive' (100 runs):

              0.20 msec task-clock                #    0.568 CPUs utilized            ( +-  0.65% )
                 0      context-switches          #    0.000 /sec                   
                 0      cpu-migrations            #    0.000 /sec                   
                46      page-faults               #  228.637 K/sec                    ( +-  0.20% )
           682,379      cycles                    #    3.370 GHz                      ( +-  0.65% )
           649,553      instructions              #    0.95  insn per cycle           ( +-  0.19% )
           125,643      branches                  #  620.580 M/sec                    ( +-  0.19% )
             3,553      branch-misses             #    2.83% of all branches          ( +-  1.45% )

        0.00035664 +- 0.00000320 seconds time elapsed  ( +-  0.90% )

Notice that the naive version is not slower.
Look at the assembly code, if(i % 3) is:

        movl    -4(%rbp), %ecx
        movl    %ecx, %edx
        movl    $2863311531, %eax
        imulq   %rdx, %rax
        shrq    $32, %rax
        shrl    %eax
        movl    %eax, %edx
        addl    %edx, %edx
        addl    %eax, %edx
        movl    %ecx, %eax
        subl    %edx, %eax
        testl   %eax, %eax
        jne     .L3

2863311531 is 0xAAAAAAB, which is

⌈ 2^{33} / 3 ⌉

, which means that compiler converts division to multiplication just like we did.
Furthermore, the naive version does not call strncpy but the bitwise version does

strncpy(fmt,"FizzBuzz",8)is converted to

        leaq    -13(%rbp), %rax
        movabsq $8825495348169042246, %rsi
        movq    %rsi, (%rax)

strncpy(fmt, &"FizzBuzz%u"[(8 >> div5) >> (div3 << 2)], length)is converted to

        movl    -12(%rbp), %edx
        movzbl  -6(%rbp), %eax
        movl    $8, %esi
        movl    %eax, %ecx
        sarl    %cl, %esi
        movzbl  -5(%rbp), %eax
        sall    $2, %eax
        movl    %eax, %ecx
        sarl    %cl, %esi
        movl    %esi, %eax
        cltq
        leaq    .LC0(%rax), %rcx
        leaq    -21(%rbp), %rax
        movq    %rcx, %rsi
        movq    %rax, %rdi
        call    strncpy

2022q1 Homework2 (quiz2)

Problem1

Problem 2

Problem 3

Problem 4

Problem 5

Improvement

Problem 6

Problem 7

Read more

[ESCA](https://eecheng87.github.io/ESCA/)

2022q1 Homework6 (ktcp)

2022q1 Homework5 (quiz8)

2022q1 Homework5 (quiz6)