Try   HackMD

2020q1 第 9 週測驗題

tags: linux2020

目的: 檢驗學員對 Linux: 淺談同步機制Concurrent Programming 的認知

作答表單


測驗 1

以下是個分析 mutex lock/unlock 操作的執行成本的程式: (檔名 mutexperf.c,以 $ gcc -o mutexperf mutexperf.c -lpthread 編譯)

#define _GNU_SOURCE

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/time.h>

struct test_results {
    int reps;
    int64_t start, stop;
};

static int64_t now_usecs(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (int64_t) tv.tv_sec * 1000000 + tv.tv_usec;
}

/* Simply acquiring and releasing a lock, without any contention. */
static void lock_unlock(struct test_results *res) {
    pthread_mutex_t mutex;
    pthread_mutex_init(&mutex, NULL);

    res->start = now_usecs();
    for (int i = res->reps; i--;) {
        pthread_mutex_lock(&mutex);
        pthread_mutex_unlock(&mutex);
    }
    res->stop = now_usecs();

    pthread_mutex_destroy(&mutex);
}

static int cmp_int64(const void *ap, const void *bp) {
    int64_t a = *(int64_t *) ap, b = *(int64_t *) bp;

    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
}

#define SETS 10
static void measure(void (*test)(struct test_results *res),
                    const char *name,
                    int reps) {
    printf("Measuring %s: ", name);
    fflush(stdout);

    struct test_results res = { .reps = reps };
    int64_t times[SETS];
    for (int i = 0; i < SETS; i++) {
        test(&res);
        times[i] = res.stop - res.start;
    }

    qsort(times, SETS, sizeof(int64_t), cmp_int64);
    printf("best %d ns, 50%%ile %d ns\n", (int) (times[0] * 1000 / reps),
           (int) (times[SETS / 2] * 1000 / reps));
}

int main(void) {
    measure(lock_unlock, "Locking and unlocking without contention", 10000000);
    return 0;
}

參考執行輸出:

Measuring Locking and unlocking without contention: best 31 ns, 50%ile 32 ns

其中 50%ile (不要看成 idle) 表示第 50 百分位數

考慮到 lock contention,在 cmp_int64 函式之前,增加以下程式碼:

#define CONTENTION_THREAD_COUNT 4
#define CONTENTION_MUTEX_COUNT (CONTENTION_THREAD_COUNT + 1)

struct contention_info {
    pthread_mutex_t mutexes[CONTENTION_MUTEX_COUNT];

    pthread_mutex_t ready_mutex;
    pthread_cond_t ready_cond;
    int ready_count;

    int thread_reps;
};

struct contention_thread_info {
    struct contention_info *info;
    pthread_mutex_t start_mutex;
    int thread_index;
    int64_t start, stop;
};

static void *contention_thread(void *v_thread_info)
{
    struct contention_thread_info *thread_info = v_thread_info;
    struct contention_info *info = thread_info->info;
    int i = thread_info->thread_index;
    int reps = info->thread_reps;

    /* Lock our first mutex */
    pthread_mutex_lock(&info->mutexes[i]);

    /* Indicate that we are ready for the test. */
    pthread_mutex_lock(&info->ready_mutex);
    if (++info->ready_count == CONTENTION_THREAD_COUNT)
        AAA;
    pthread_mutex_unlock(&info->ready_mutex);

    /* Line up to start */
    pthread_mutex_lock(&thread_info->start_mutex);
    pthread_mutex_unlock(&thread_info->start_mutex);

    thread_info->start = now_usecs();

    for (int j = 1; j < reps; j++) {
        int next = (i + 1) % CONTENTION_MUTEX_COUNT;
        pthread_mutex_lock(&info->mutexes[next]);
        BBB;
        i = next;
    }

    thread_info->stop = now_usecs();

    pthread_mutex_unlock(&info->mutexes[i]);
    return NULL;
}

static void contention(struct test_results *res)
{
    struct contention_info info;
    struct contention_thread_info thread_infos[CONTENTION_THREAD_COUNT];
    pthread_t threads[CONTENTION_THREAD_COUNT];

    for (int i = 0; i < CONTENTION_MUTEX_COUNT; i++)
        pthread_mutex_init(&info.mutexes[i], NULL);

    pthread_mutex_init(&info.ready_mutex, NULL);
    pthread_cond_init(&info.ready_cond, NULL);
    info.ready_count = 0;
    info.thread_reps = res->reps / CONTENTION_THREAD_COUNT;

    for (int i = 0; i < CONTENTION_THREAD_COUNT; i++) {
        thread_infos[i].info = &info;
        thread_infos[i].thread_index = i;
        pthread_mutex_init(&thread_infos[i].start_mutex, NULL);
        pthread_mutex_lock(&thread_infos[i].start_mutex);
        pthread_create(&threads[i], NULL, contention_thread, &thread_infos[i]);
    }

    pthread_mutex_lock(&info.ready_mutex);
    while (info.ready_count < CONTENTION_THREAD_COUNT)
        CCC;
    pthread_mutex_unlock(&info.ready_mutex);

    for (int i = 0; i < CONTENTION_THREAD_COUNT; i++)
        pthread_mutex_unlock(&thread_infos[i].start_mutex);

    for (int i = 0; i < CONTENTION_THREAD_COUNT; i++) {
        pthread_join(threads[i], NULL);
        pthread_mutex_destroy(&thread_infos[i].start_mutex);
    }

    for (int i = 0; i < CONTENTION_MUTEX_COUNT; i++)
        pthread_mutex_destroy(&info.mutexes[i]);

    pthread_mutex_destroy(&info.ready_mutex);
    pthread_cond_destroy(&info.ready_cond);

    res->start = thread_infos[0].start;
    res->stop = thread_infos[0].stop;
    for (int i = 1; i < CONTENTION_THREAD_COUNT; i++) {
        if (thread_infos[i].start < res->start)
            res->start = thread_infos[i].start;
        if (thread_infos[i].stop > res->stop)
            res->stop = thread_infos[i].stop;
    }
}

並做了以下調整:

 int main(void)
 {
     measure(lock_unlock, "Locking and unlocking without contention", 10000000);
+    measure(contention, "Locking and unlocking with contention", 100000);
     return 0;
 }

參考的執行輸出:

Measuring Locking and unlocking without contention: best 31 ns, 50%ile 33 ns
Measuring Locking and unlocking with contention: best 5269 ns, 50%ile 5895 ns

基於效能的測試的目標,請補完程式碼。

作答區

AAA = ?

  • (a) pthread_cond_wait(&info.ready_cond, &info.ready_mutex)
  • (b) pthread_cond_signal(&info->ready_cond)
  • (c) pthread_mutex_unlock(&info->ready_mutex)

BBB = ?

  • (a) pthread_mutex_unlock(&info->mutexes[next])
  • (b) pthread_mutex_unlock(&info->mutexes[j])
  • (c) pthread_mutex_unlock(&info->mutexes[i])

CCC = ?

  • (a) pthread_cond_wait(&info.ready_cond, &info.ready_mutex)
  • (b) pthread_cond_signal(&info->ready_cond)
  • (c) pthread_mutex_unlock(&info->ready_mutex)

延伸問題:

  1. 解釋上述 mutexperf.c 效能分析工具的原始程式碼,指出其不足並改善
  2. 研讀論文 A Review of Lightweight Thread Approaches for High Performance Computing 並學習其中效能分析的手法,探討 Linux NPTL (即 native thread) 和 lightweight ULT 的表現