2020q3 Homework9 (quiz 9)

contributed by < JimmyLiu0530 >

tags: `進階電腦系統理論與實作`

測驗1: 用 ANN 推理出 2-input multiplexer 的邏輯

在若干作業系統 (如 Linux, OpenBSD, MS-Windows, macOS) 中，Address space layout randomization (簡稱 ASLR) 是種防範記憶體損壞漏洞被利用的電腦安全技術，透過隨機放置行程關鍵資料區域的定址空間，防止攻擊者能可靠地跳躍到記憶體的特定位置。我們可利用 ASLR 來設定亂數種子，例如:











static uint64_t splitmix_x;
static void splitmix_init()
{
    /* Hopefully, ASLR makes our function address random */
    uintptr_t x = (uintptr_t)((void *) &splitmix_init);
    struct timespec time;
    clock_gettime(CLOCK_MONOTONIC, &time);
    x ^= (uintptr_t) time.tv_sec;
    x ^= (uintptr_t) time.tv_nsec;
    splitmix_x = x;
}

由於函式 splitmix_init 的地址在 ASLR 生效後，可能會隨機得到不同的數值 (注意: 僅是「可能」，實際要看作業系統的表現)，我們可利用這特性來設定亂數種子 splitmix_x。

以下是一個類神經網路的 C 語言實作:
































































































































































































































































































#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

struct ann;
typedef double (*ann_act_func)(const struct ann *ctx, double a);

typedef struct ann {
    int inputs, hidden_layers, hidden, outputs;
    ann_act_func activation_hidden; /* used for hidden neurons */
    ann_act_func activation_output; /* used for output */
    int total_weights;              /* weights and size of weights buffer */
    int total_neurons; /* neurons + inputs and size of output buffer */
    double *weight;    /* All weights */
    double *output;    /* store input array and output of each neuron */
    double *delta;     /* total_neurons - inputs */
} ann_t;

#define LOOKUP_SIZE 4096

static inline double ann_act_hidden_indirect(const struct ann *ctx, double a) {
    return ctx->activation_hidden(ctx, a);
}

static inline double ann_act_output_indirect(const struct ann *ctx, double a) {
    return ctx->activation_output(ctx, a);
}

static const double sigmoid_dom_min = -15.0, sigmoid_dom_max = 15.0;
static double interval, lookup[LOOKUP_SIZE];

#define unlikely(x) __builtin_expect(!!(x), 0)
#define UNUSED __attribute__((unused))

static double ann_act_sigmoid(const ann_t *ctx UNUSED, double a) {
    if (a < -45.0)
        return 0;
    if (a > 45.0)
        return 1;
    return 1.0 / (1 + exp(-a));
}

static void ann_init_sigmoid_lookup(const ann_t *ctx) {
    const double f = (sigmoid_dom_max - sigmoid_dom_min) / LOOKUP_SIZE;

    interval = LOOKUP_SIZE / (sigmoid_dom_max - sigmoid_dom_min);
    for (int i = 0; i < LOOKUP_SIZE; ++i)
        lookup[i] = ann_act_sigmoid(ctx, sigmoid_dom_min + f * i);
}

static double ann_act_sigmoid_cached(const ann_t *ctx UNUSED, double a) {
    assert(!isnan(a));

    if (a < sigmoid_dom_min)
        return lookup[0];
    if (a >= sigmoid_dom_max)
        return lookup[LOOKUP_SIZE - 1];

    size_t j = (a - sigmoid_dom_min) * interval + 0.5;
    if (unlikely(j >= LOOKUP_SIZE))
        return lookup[LOOKUP_SIZE - 1];

    return lookup[j];
}

/* conventional generator (splitmix) developed by Steele et al. */
static uint64_t splitmix_x;
static inline uint64_t splitmix() {
    splitmix_x += 0x9E3779B97F4A7C15;
    uint64_t z = splitmix_x;
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EB;
    return z ^ (z >> 31);
}

static void splitmix_init() {
    /* Hopefully, ASLR makes our function address random */
    uintptr_t x = (uintptr_t)((void *) &splitmix_init);
    struct timespec time;
    clock_gettime(CLOCK_MONOTONIC, &time);
    x ^= (uintptr_t) time.tv_sec;
    x ^= (uintptr_t) time.tv_nsec;
    splitmix_x = x;

    /* do a few randomization steps */
    uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
    for (uintptr_t i = 0; i < max; i++)
        splitmix();
}

/* uniform random numbers between 0 and 1 */
#define ANN_RAND() (((double) splitmix()) / UINT64_MAX)

/* Set weights randomly */
void ann_randomize(ann_t *ctx) {
    for (int i = 0; i < ctx->total_weights; ++i)
        ctx->weight[i] = ANN_RAND() - 0.5; /* weights from -0.5 to 0.5 */
}

/* Create and return a new network context */
ann_t *ann_init(int inputs, int hidden_layers, int hidden, int outputs) {
    if (hidden_layers < 0)
        return 0;
    if (inputs < 1)
        return 0;
    if (outputs < 1)
        return 0;
    if (hidden_layers > 0 && hidden < 1)
        return 0;

    const int hidden_weights =
        hidden_layers ? (inputs + 1) * hidden +
                            (hidden_layers - 1) * (hidden + 1) * hidden
                      : 0;
    const int output_weights =
        (hidden_layers ? (WWW) : (inputs + 1)) * outputs;
    const int total_weights = (hidden_weights + output_weights);

    const int total_neurons = (inputs + hidden * hidden_layers + outputs);

    /* Allocate extra size for weights, outputs, and deltas. */
    const int size =
        sizeof(ann_t) + sizeof(double) * (total_weights + total_neurons +
                                          (total_neurons - inputs));
    ann_t *ret = malloc(size);
    assert(ret);

    ret->inputs = inputs;
    ret->hidden_layers = hidden_layers;
    ret->hidden = hidden;
    ret->outputs = outputs;

    ret->total_weights = total_weights;
    ret->total_neurons = total_neurons;

    ret->weight = (double *) ((char *) ret + sizeof(ann_t));
    ret->output = ret->weight + ret->total_weights;
    ret->delta = ret->output + ret->total_neurons;

    ann_randomize(ret);

    ret->activation_hidden = ann_act_sigmoid_cached;
    ret->activation_output = ann_act_sigmoid_cached;

    ann_init_sigmoid_lookup(ret);
    return ret;
}

/* Return a new copy of network context */
ann_t *ann_copy(ann_t const *ctx) {
    const int size = sizeof(ann_t) +
                     sizeof(double) * (ctx->total_weights + ctx->total_neurons +
                                       (ctx->total_neurons - ctx->inputs));
    ann_t *ret = malloc(size);
    assert(ret);

    memcpy(ret, ctx, size);

    ret->weight = (double *) ((char *) ret + sizeof(ann_t));
    ret->output = ret->weight + ret->total_weights;
    ret->delta = ret->output + ret->total_neurons;

    return ret;
}

/* Free the memory used by a network context */
void ann_free(ann_t *ctx) { free(ctx); }

/* Run the feedforward algorithm to calculate the output of the network. */
double const *ann_run(ann_t const *ctx, double const *inputs) {
    double const *w = ctx->weight;
    double *o = ctx->output + ctx->inputs;
    double const *i = ctx->output;

    /* Copy the inputs to the scratch area, where we also store each neuron's
     * output, for consistency. This way the first layer isn't a special case.
     */
    memcpy(ctx->output, inputs, sizeof(double) * ctx->inputs);

    if (!ctx->hidden_layers) {
        double *ret = o;
        for (int j = 0; j < ctx->outputs; ++j) {
            double sum = *w++ * (-1.0);
            for (int k = 0; k < ctx->inputs; ++k)
                sum += *w++ * i[k];
            *o++ = ann_act_output_indirect(ctx, sum);
        }
        return ret;
    }

    /* Figure input layer */
    for (int j = 0; j < ctx->hidden; ++j) {
        double sum = *w++ * (-1.0);
        for (int k = 0; k < ctx->inputs; ++k)
            sum += *w++ * i[k];
        *o++ = ann_act_hidden_indirect(ctx, sum);
    }

    i += ctx->inputs;

    /* Figure hidden layers, if any. */
    for (int h = 1; h < ctx->hidden_layers; ++h) {
        for (int j = 0; j < ctx->hidden; ++j) {
            double sum = *w++ * (-1.0);
            for (int k = 0; k < ctx->hidden; ++k)
                sum += *w++ * i[k];
            *o++ = ann_act_hidden_indirect(ctx, sum);
        }
        i += ctx->hidden;
    }

    double const *ret = o;

    /* Figure output layer. */
    for (int j = 0; j < ctx->outputs; ++j) {
        double sum = *w++ * (-1.0);
        for (int k = 0; k < ctx->hidden; ++k)
            sum += *w++ * i[k];
        *o++ = ann_act_output_indirect(ctx, sum);
    }

    assert(w - ctx->weight == ctx->total_weights);
    assert(o - ctx->output == ctx->total_neurons);

    return ret;
}

#define LOOP_MAX 1024

int main(int argc, char *argv[]) {
    splitmix_init();

    /* Input and expected out data for the 2-input Multiplexer */
    const double input[8][3] = {
        {0, 0, 0}, {0, 0, 1}, {0, 1, 0}, {0, 1, 1},
        {1, 0, 0}, {1, 0, 1}, {1, 1, 0}, {1, 1, 1},
    };
    const double output[8] = {
        0, 0, 1, 1, 0, 1, 0, 1,
    };

    /* New network with 3 inputs, 1 hidden layer of 2 neurons,
     * and 1 output.
     */
    ann_t *ctx = ann_init(3, 1, 2, 1);

    double err, last_err = LOOP_MAX;
    int count = 0;
    do {
        ++count;
        if (count % LOOP_MAX == 0) {
            ann_randomize(ctx);
            last_err = LOOP_MAX;
        }

        ann_t *save = ann_copy(ctx);

        /* Take a random guess at the ANN weights. */
        for (int i = 0; i < ctx->total_weights; ++i)
            ctx->weight[i] += RRR;

        err = 0;
        for (int k = 0; k < (1 << 3); k++)
            err += pow(*ann_run(ctx, input[k]) - output[k], 2.0);

        /* Keep these weights if they're an improvement. */
        if (err < last_err) {
            ann_free(save);
            last_err = err;
        } else {
            ann_free(ctx);
            ctx = save;
        }
    } while (err > 0.01);
    printf("Finished in %d loops.\n", count);

    /* Run the network and see what it predicts. */
    for (int k = 0; k < (1 << 3); k++)
        printf("[%1.f, %1.f, %1.f] = %1.f.\n", input[k][0], input[k][1],
               input[k][2], *ann_run(ctx, input[k]));

    ann_free(ctx);
    return 0;
}

上方的程式碼嘗試由 Multiplexer 的輸入和輸出，推理出其中邏輯。Multiplexing switch 的作用:

一個具備

2^{n}

輸入端的 Multiplexer (簡稱 MUX) 有

n

個可選擇的輸入－輸出線路，可經由控制端來選擇 (select) 其中一個訊號作為輸出。

2-input Multiplexer Design
出處: The Multiplexer

該程式預期的執行輸出如下:

Finished in 359 loops.
[0, 0, 0] = 0.
[0, 0, 1] = 0.
[0, 1, 0] = 1.
[0, 1, 1] = 1.
[1, 0, 0] = 0.
[1, 0, 1] = 1.
[1, 1, 0] = 0.
[1, 1, 1] = 1.

其中 359 可能會替換為其他數值。請研讀程式碼，並補完程式碼，使得類神經網路得以運作。

程式說明

splitmix_init() 在這程式中所扮演的角色?

這個程式主要透過 feedforward algorithm 以及隨機猜測 ANN 的 weights 來模擬一個 2-input 多工器的結果。

函式 ann_init:
- 根據傳入的參數，依序為 # of input neurons, # of hidden layers, # of neurons per hidden layer, 以及 # of output neurons 來配置空間給 ANN 中的參數並初始化。
  - 像是 hidden_weights 代表 input 跟 hidden、hidden 跟 hidden 間總共有多少個 weights (包含邊的 weights 以及點的 biases)。
  - 而 output_weights 則是代表 hidden (或 input) 跟 output 間的 weights 總數，因此，WWW = hidden + 1，因為 +1 for biases of output-layer neurons。
- 此外，在初始化的過程中，還會去呼叫函式 ann_randomize 來給予 ANN 中所有 weights 一個隨機的值，其中值介於 -0.5 ~ 0.5 。
- 最後，呼叫函式 ann_init_sigmoid_lookup 來預先計算好輸入介於 sigmoid_dom_min 與 sigmoid_dom_max 間的激勵函數值，並存於 lookup 表格中。

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

NOTE:
這邊的預先計算並不是真的把所有在這範圍內的實數都算出來，因為我們 lookup 表格大小有限，因此只能將這個範圍切出 LOOKUP_SIZE 個區間，在相同區間內的 x 都會得到相同的函數值。

函數 ann_run
- line 184-193 這是 for 沒有 hidden layer 的 ANN
- 如有 hidden layer，主要分成三大類去計算他們的激勵函數值，input layer -> hidden layer、hidden layer -> hidden layer、以及 hidden layer -> output layer，但其實不管哪一類，操作都很相似。
- 舉 input layer -> hidden layer 為例:
  - 對第一層 hidden layer 的每個神經元去計算他們的激勵函數值。
  - 每個神經元的激勵函數的輸入為所有 input layer 的 neurons 的 feature values 乘上 weight，再加上 bias。透過呼叫函式 ann_act_hidden_indirect，查詢 lookup 表格，即可取得其激勵函數值，並存入 *o++
- 最後，所有 output layer 神經元的激勵函數值即為此次預測的結果，並回傳存儲存這些值的初始位置。

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

NOTE:

*o++ 的意思是甚麼?

++ 在前的優先權與 * 相同，因此關聯性為從右到左
++ 在後的優先權高於 ++在前與 *，因此關聯性為從左到右







int a[]={10, 20};
int *o = a;

printf("%d\n", *o++); /* Get 10, and o points to the next element(i.e. 20) */
printf("%d\n", (*o)++); /* Get 10, after printing, 10 will be increased by 1 */
printf("%d\n", *++o); /* Get 20 */
printf("%d\n", ++*o); /* Get 11 */

除了 input layer 的神經元的 input 是先給定的之外，其餘神經元的激勵函數值會在計算下一層神經元的激勵函數時用到。

line 267-268 會計算此次預測的 loss function，並與上一次的預測做比較。若這次的預測比上次差，則捨棄這次的預測；否則捨棄上一次的預測。不斷猜測 ANN weights，直到預測的誤差小於等於 0.01。

延伸問題

1. 使用 back-propagation (反向傳遞) 改寫上述程式碼，並探討激勵函數的特性

2. 引入其他激勵函數，例如 ReLU，並確保針對其特性進行適度初始化

3. 比照 Tinn，搭配給定的 data set，實作出能夠辨識手寫印度—阿拉伯數字的類神經網路

參考 nanonn

4. 研究亂數相關文獻，並探討偽亂數產生器 (PRNG) 的實作

你的程式夠「亂」嗎？

亂數

The fastest conventional random number generator that can pass Big Crush?

ARM and Intel have different performance characteristics: a case study in random number generation

5. 研讀 Vectorizing random number generators for greater speed: PCG and xorshift128+，重現實驗並思考 SIMD 加速 PRNG 的機會

6. 研讀 Bitwise Neural Networks 和 Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1，摘錄論文重點，並試著在上述程式碼的基礎上予以實作

tinier-nn

測驗2: LeetCode 37. Sudoku Solver

數獨規則如下:

每列 (row) 必須出現 1 到 9 的數字，且只能出現一次
每行 (column) 必須出現 1 到 9 的數字，且只能出現一次
每個
$3 \times 3$ 構成的「宮」(即子格子，以粗框標記) 內，數字 1 到 9 必須出現，且只能出現一次

示範:

解法:

若每次挑出一空格，並展開完成該盤面的可能數字，嘗試展開所有解法路徑，可畫成一個樹 (tree) 來思考：

其中必定存在一條從 root 到 leaf 的路徑，能將其盤面填滿，並且盤面上的所有數字，能滿足數獨的所有條件規則 —— 題目假設存在唯一解。在決定空格能填什麼數字時，已將不符合規則的數字排除，以減少後續的計算量

參考解題思路

以下是個可能的程式碼實作:









































































#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

typedef struct {
    uint16_t row[9], col[9], box[9];
} masks_t;

const uint16_t ALL_NUMS = 0x1FF;

static inline int bi(int x, int y)
{
    return (y / 3) * 3 + (x / 3);
}

static bool solve(char **board, masks_t *m)
{
    for (int y = 0; y < 9; ++y) {
        for (int x = 0; x < 9; ++x) {
            if (board[y][x] != '.')
                continue;

            int b = bi(x, y);
            uint16_t posmoves = m->row[x] & m->col[y] & m->box[b];
            while (posmoves) {
                size_t n = __builtin_ctz(posmoves);
                uint16_t k = ~(1U << n);
                board[y][x] = '1' + n;
                m->row[x] &= k, m->col[y] &= k, m->box[b] &= k;
                if (solve(board, m))
                    return true;
                m->row[x] |= ~k, m->col[y] |= ~k, m->box[b] |= ~k;
                posmoves &= MOVE;
            }
            board[y][x] = '.';
            return false;
        }
    }
    return true;
}

void solveSudoku(char **board, int boardSize, int *boardColSize)
{
    masks_t m;

    for (int i = 0; i < 9; ++i)
        m.row[i] = m.col[i] = m.box[i] = ALL_NUMS;
    for (int y = 0; y < 9; ++y) {
        for (int x = 0; x < 9; ++x) {
            if (board[y][x] != '.') {
                int n = board[y][x] - '1';
                uint16_t k = ~(1U << n);
                m.row[x] &= k, m.col[y] &= k, m.box[bi(x, y)] &= k;
            }
        }
    }

    for (int y = 0; y < 9; ++y) {
        for (int x = 0; x < 9; ++x) {
            if (board[y][x] != '.')
                continue;
            uint16_t mask = m.row[x] & m.col[y] & m.box[bi(x, y)];
            if ((mask & (MMM)) == 0) {
                size_t n = __builtin_ctz(mask);
                uint16_t k = ~(1U << n);
                board[y][x] = '1' + n;
                m.row[x] &= k, m.col[y] &= k, m.box[bi(x, y)] &= k;
            }
        }
    }

    solve(board, &m);
}

請補完程式碼，使得程式符合 LeetCode 題目要求。

程式說明

先考慮函式 solveSudoku :

m.row[i], m.col[i], m.box[i] 這三個陣列分別用來記錄第 i 行, 列, 3x3 格子中用過哪些數字，並以 bit 表示。舉例來說，m.row[0] = 0000 0000 0110 1010 代表在第 0 行，數字 2, 4, 6, 7 已被使用過。因此，在 line 48-56 根據現有的 9x9 棋盤格去更新每行、列以及 3x3 格子中，哪些數字被用過了。此外，bi(int x, int y) 函式的功能在於回傳 (x, y) 格子位於哪個 3x3 方格。

至於 line 58-70 主要針對棋盤格上的空格作處理，其中 mask 紀錄某個 1x1 格子可以填入的數字，若只有一種數字可以填，那麼直接填入該格，並更新 m.row[i], m.col[i], m.box[i]。因此 MMM = mask - 1。

上述作業完成後，呼叫函式 solve，主要利用遞迴地去檢查每一種可能到底有沒有符合數獨的規則。一旦有任何一個 1x1 格子沒有數字可以填，則直接回傳 false。

因為前面已經處理過只有一種數字可填的情況了，因此剩下的空格都是至少有兩種可能。line 25-34 就是在試看看每個格子的所有可能的數字，每試一種數字都會遞迴呼叫函式 solve 去檢查在這種嘗試下的棋盤格是否能完成數獨。若可以，則回傳 true；否則須將此種數字從 posmoves 中移除。因此，MOVE = posmoves - 1。

延伸問題

1. 重寫不同於上述的程式碼，效能表現應優於上述

2. 嘗試運用 SIMD 指令加速數獨求解

參考 Nerd Sniped: A Sudoku Story 及 simdoku

2020q3 Homework9 (quiz 9)

tags: 進階電腦系統理論與實作

測驗1: 用 ANN 推理出 2-input multiplexer 的邏輯

程式說明

延伸問題

1. 使用 back-propagation (反向傳遞) 改寫上述程式碼，並探討激勵函數的特性

2. 引入其他激勵函數，例如 ReLU，並確保針對其特性進行適度初始化

3. 比照 Tinn，搭配給定的 data set，實作出能夠辨識手寫印度—阿拉伯數字的類神經網路

4. 研究亂數相關文獻，並探討偽亂數產生器 (PRNG) 的實作

5. 研讀 Vectorizing random number generators for greater speed: PCG and xorshift128+，重現實驗並思考 SIMD 加速 PRNG 的機會

6. 研讀 Bitwise Neural Networks 和 Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1，摘錄論文重點，並試著在上述程式碼的基礎上予以實作

測驗2: LeetCode 37. Sudoku Solver

程式說明

延伸問題

1. 重寫不同於上述的程式碼，效能表現應優於上述

2. 嘗試運用 SIMD 指令加速數獨求解

Read more

台灣萬萬稅

CS:APP 第一章重點整理 - Introduction

GDB介紹

Ubuntu 20.04 wifi 選項消失之解決方法

tags: `進階電腦系統理論與實作`