Try   HackMD

2022q1 Homework4 (quiz4)

tags: linux2022

測驗題目

測驗 1

延伸第 3 週測驗題的測驗 7,已知輸入必為大於 0 的數值 (即 x > 0),以下程式碼可計算

$\lceil \log_2 x \rceil$,也就是 ceil 和 log2 的組合並轉型為整數:

/*
 * Quiz template for ceil(log2(x)); EXP1 is the blank to be filled in.
 * The accompanying text states that the caller guarantees x > 0.
 *
 * After the decrement, each step tests whether the upper half of the
 * remaining bits is non-zero, shifts x down by that half-width when it is,
 * and records the shift amount in r.
 *
 * NOTE(review): for x == 1 the decrement makes x zero, so every step
 * contributes 0 and, with EXP1 = r | shift | x >> 1, the result is 1 rather
 * than 0 - confirm whether that input is expected to be handled.
 */
int ceil_log2(uint32_t x)
{
    uint32_t r, shift;

    x--;                      /* so that an exact power of two is not rounded up */
    r = (x > 0xFFFF) << 4;    /* 16 if any of the upper 16 bits is set, else 0 */
    x >>= r;
    shift = (x > 0xFF) << 3;  /* 8 if any bit above the low byte is set */
    x >>= shift;
    r |= shift;
    shift = (x > 0xF) << 2;   /* 4 if any bit above the low nibble is set */
    x >>= shift;
    r |= shift;
    shift = (x > 0x3) << 1;   /* 2 if any bit above the low two bits is set */
    x >>= shift;
    return (EXP1) + 1;        /* EXP1 merges the remaining steps into one expression */
}

整個過程就是對半切看前半部是否有值,也就是 binary search,當剩下最後兩個 bits 時,參考 jim12312321 的化簡前步驟

r |= shift;              /* fold in the 2-bit step computed just before */
shift = (x > 0x1) << 0;  /* final step: 1 when bit 1 of x is still set */
r |= shift;

其中第 2, 3 行可化簡成

r |= (x >> 1);

因此,EXP1 = r | shift | x >> 1

關於 x--

為何要一開始要 x-- ? 假設我們先不考慮 x--

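  • $2^{n-1} < x < 2^n$
    • 以 $2$ 為底取 $\log$:$n-1 < \log_2 x < n$
    • ceil 等同於 $n < \log_2 x + 1 < n+1$
    • 取整數:$\text{int}(\log_2 x + 1) = n$
  • $x = 2^n$ 時,若同樣以上面的操作,
    • 以 $2$ 為底取 $\log$:$\log_2 x = n$
    • ceil 等同於 $\log_2 x + 1 = n+1$
    • 取整數:$\text{int}(\log_2 x + 1) = n+1$,答案錯了,應為 $n$
  • $x = 2^{n-1}$ 時,若同樣以上面的操作,
    • 以 $2$ 為底取 $\log$:$\log_2 x = n-1$
    • ceil 等同於 $\log_2 x + 1 = n$
    • 取整數:$\text{int}(\log_2 x + 1) = n$,答案錯了,應為 $n-1$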

考慮 x--

  • $x = 2^n$ 時,$x - 1 < 2^n$
  • $x = 2^{n-1}$ 時,$x - 1 < 2^{n-1}$

如此一來,最後取 ceil 與取整數步驟的答案就會是對的。另外,當 x = 0 時,x - 1 會 wrap around 成 0xFFFFFFFF (在 uint32_t 下 0xFFFFFFFF + 1 = 0),因此這版本的 ceil_log2(0) = 32

維持 Branchless 並解決 x 為 0 時的特例

測驗 2

複習〈你所不知道的 C 語言: bitwise 操作〉,改寫第 3 週測驗題的測驗 11 裡頭的 fls 函式 (fls 意謂 “find last set”),使其得以計算 Find first set:

#define BITS_PER_BYTE 8 #define BITS_PER_LONG (sizeof(unsigned long) * BITS_PER_BYTE) #include <stddef.h> static inline size_t ffs(unsigned long x) { if (x == 0) return 0; size_t o = 1; unsigned long t = ~0UL; size_t shift = BITS_PER_LONG; shift >>= 1; t >>= shift; while (shift) { if ((EXP2) == 0) { x >>= shift; EXP3; } shift >>= 1; t >>= shift; } return o; }

假設 sizeof(unsigned long) = 8,第 15, 16 行得到 shift = 32, t = 0xFFFFFFFF,可以看出 t 是 bitmask,因此 EXP2 = x & t,EXP3 = o += shift,也就是 binary search 作法

迴避 while, for, goto 等關鍵字以改寫出功能等價的實作如下

linux/include/asm-generic/bitops/ffs.h

/**
 * ffs - find first set bit in word
 * @x: the word to search
 *
 * Returns the 1-based position of the least-significant set bit of @x,
 * or 0 when @x is zero. The search is an unrolled binary search: each step
 * checks whether the low half of the remaining window is empty and, if so,
 * discards it and adds its width to the result.
 *
 * The scan runs on an unsigned copy of @x because right-shifting a negative
 * signed value is implementation-defined in C.
 */
static inline int ffs(int x)
{
	unsigned int v = (unsigned int) x;
	int r = 1;

	if (!v)
		return 0;
	if (!(v & 0xffffu)) {
		v >>= 16;
		r += 16;
	}
	if (!(v & 0xffu)) {
		v >>= 8;
		r += 8;
	}
	if (!(v & 0xfu)) {
		v >>= 4;
		r += 4;
	}
	if (!(v & 3u)) {
		v >>= 2;
		r += 2;
	}
	/* Last step: only the position matters, v is not needed afterwards. */
	if (!(v & 1u))
		r += 1;
	return r;
}

測驗 3

考慮以下改寫自 Linux 核心的程式碼:

/* One consumer entry; entries are chained through @next into a singly linked list. */
struct foo_consumer {
    int (*handler)(struct foo_consumer *self, void *); /* callback taking the consumer itself plus an opaque argument */
    struct foo_consumer *next; /* following consumer; consumer_del() stops walking at NULL */
};

/* Owner of a consumer list. */
struct foo {
    struct foo_consumer *consumers; /* head of the consumer list, NULL when empty */
    unsigned long flags; /* not used by the code shown here */
};

#include <stdbool.h>

/*
 * For foo @foo, delete the consumer @fc.
 * Return true if @fc was found and unlinked successfully,
 * or false if @fc is not on the list.
 *
 * Quiz template: EXP4 (loop advance) and EXP5 (replacement link) are blanks.
 * @con always points at the link that leads to the current node (first
 * &foo->consumers, later presumably a node's next field), so the head
 * needs no special case.
 */
static bool consumer_del(struct foo *foo, struct foo_consumer *fc)
{
    struct foo_consumer **con;
    bool ret = false;

    for (con = &foo->consumers; *con; EXP4) {
        if (*con == fc) {
            *con = EXP5; /* rewrite the incoming link so it bypasses @fc */
            ret = true;
            break;
        }
    }

    return ret;
}

EXP4 = con = &(*con)->next
EXP5 = fc->next

可以參考 yaohwang99 的圖,很漂亮







%0


cluster0

struct foo


cluster1

struct foo_consumer


cluster2

struct foo_consumer


cluster3

struct foo_consumer



node0

consumers

flag



node1

handler

next



node0:consumers->node1





node2

handler

next



node1:next->node2





node3

handler

next



node2:next->node3





con
con



con->node1:next





fc
fc



fc->node2





測驗 4

以下嘗試用 lab0 提及的 setjmp 與 longjmp,用以實作〈你所不知道的 C 語言: goto 和流程控制篇〉闡述的 coroutine,參考的程式執行輸出如下:

Task 0: n = 3
Task 1: n = 3
Task 0: resume
Task 1: resume
Task 0: resume
Task 0: complete
Task 1: resume
Task 1: complete

原始程式碼如下: (檔名 jmp.c)

#include <setjmp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "list.h"

/* A suspended task: a saved execution context queued on the task list. */
struct task {
    jmp_buf env;           /* context copied from a setjmp() call, resumed via longjmp() */
    struct list_head list; /* link into the task list */
};

static LIST_HEAD(tasklist);    /* queue of suspended tasks */
static void (**tasks)(void *); /* table of task entry points, set in main() */
static int ntasks;             /* number of entries in tasks not yet started */
static jmp_buf sched;          /* context saved by schedule(); tasks longjmp() back to it */






%0


cluster3

task


cluster0

tasklist


cluster1

task


cluster2

task



node0

next

prev



node1

env

list



node0:next->node1:list






node2

env

list



node1:list->node2:list






node3

env

list



node2:list->node3:list






node3:list->node0:prev






/*
 * Queue a copy of the saved context @env at the tail of @tasklist.
 *
 * The node is heap-allocated and is released by whoever dequeues it
 * (task_switch() / task_join()). An allocation failure is fatal: the
 * program reports it and exits instead of dereferencing a NULL pointer.
 */
static void task_add(struct list_head *tasklist, jmp_buf env)
{
    struct task *t = malloc(sizeof(*t));

    if (!t) {
        perror("malloc");
        exit(EXIT_FAILURE);
    }

    /* Copy the context: the caller's jmp_buf lives on its own stack. */
    memcpy(t->env, env, sizeof(jmp_buf));
    INIT_LIST_HEAD(&t->list);
    list_add_tail(&t->list, tasklist);
}

/*
 * Resume the task at the head of @tasklist, if there is one.
 * Quiz template: EXP6 is a blank. Does not return when a task is resumed;
 * returns normally only when the list is empty.
 */
static void task_switch(struct list_head *tasklist)
{
    jmp_buf env;

    if (!list_empty(tasklist)) {
        struct task *t = list_first_entry(tasklist, struct task, list);
        EXP6;
        memcpy(env, t->env, sizeof(jmp_buf)); /* copy out first: t is freed before the jump */
        free(t);
        longjmp(env, 1); /* the resumed setjmp() returns 1 */
    }
}

/*
 * Resume the task at the head of @tasklist; returns only once the list is
 * empty. Quiz template: EXP7 is a blank. Every pass through the loop body
 * ends in longjmp(), so the loop condition is evaluated again only when
 * this function is entered anew.
 */
static void task_join(struct list_head *tasklist)
{
    jmp_buf env;

    while (!list_empty(tasklist)) {
        struct task *t = list_first_entry(tasklist, struct task, list);
        EXP7;
        memcpy(env, t->env, sizeof(jmp_buf)); /* copy out first: t is freed before the jump */
        free(t);
        longjmp(env, 1); /* the resumed setjmp() returns 1 */
    }
}

/*
 * Start every registered task once, then keep resuming queued tasks until
 * the task list is empty.
 */
void schedule(void)
{
    static int i; /* index of the next task to start; static, so it keeps its value across longjmp(sched, 1) */

    srand(0xCAFEBABE ^ (uintptr_t) &schedule); /* Thanks to ASLR */

    setjmp(sched); /* tasks come back here with longjmp(sched, 1); the return value is ignored */

    while (ntasks-- > 0) {
        int n = rand() % 5; /* 0..4, the number of times the task will yield */
        tasks[i++](&n);
        printf("Never reached\n"); /* tasks leave via longjmp() instead of returning */
    }

    task_join(&tasklist);
}

/* A task yields control n times */

/*
 * Task 0 (quiz template, EXP8 is a blank): announces itself, queues its
 * context, then yields n times before completing. @arg points to an int
 * holding n.
 *
 * NOTE(review): the task is resumed by longjmp() after it has itself left
 * through longjmp(); ISO C does not define that, so the demo relies on how
 * the implementation behaves - confirm on the target platform.
 */
void task0(void *arg)
{
    jmp_buf env;
    static int n; /* static: the value is not kept in this function's stack frame */
    n = *(int *) arg;

    printf("Task 0: n = %d\n", n);

    if (setjmp(env) == 0) { /* 0 on the direct call, non-zero when resumed */
        task_add(&tasklist, env);
        EXP8;
    }

    for (int i = 0; i < n; i++) {
        if (setjmp(env) == 0) {
            task_add(&tasklist, env); /* queue this resume point ... */
            task_switch(&tasklist);   /* ... and run whichever task is at the head */
        }
        printf("Task 0: resume\n");
    }

    printf("Task 0: complete\n");
    longjmp(sched, 1); /* hand control back to schedule() */
}

/*
 * Task 1 (quiz template, EXP9 is a blank): same structure as task0, with
 * its own static counter and its own messages. @arg points to an int
 * holding n.
 */
void task1(void *arg)
{
    jmp_buf env;
    static int n; /* static: the value is not kept in this function's stack frame */
    n = *(int *) arg;

    printf("Task 1: n = %d\n", n);

    if (setjmp(env) == 0) { /* 0 on the direct call, non-zero when resumed */
        task_add(&tasklist, env);
        EXP9;
    }

    for (int i = 0; i < n; i++) {
        if (setjmp(env) == 0) {
            task_add(&tasklist, env); /* queue this resume point ... */
            task_switch(&tasklist);   /* ... and run whichever task is at the head */
        }
        printf("Task 1: resume\n");
    }

    printf("Task 1: complete\n");
    longjmp(sched, 1); /* hand control back to schedule() */
}

/* Element count of a true array (meaningless when applied to a pointer). */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

/* Register the two demo tasks and hand control to the scheduler. */
int main(void)
{
    void (*registered_task[])(void *) = {
        task0,
        task1,
    };

    ntasks = (int) ARRAY_SIZE(registered_task);
    tasks = &registered_task[0];

    schedule();
    return 0;
}

EXP6 = list_del(&t->list);
EXP7 = list_del(&t->list);
EXP8 = longjmp(sched, 1)
EXP9 = longjmp(sched, 1)

可參考 C 語言 setjmp 與 longjmp 函數用法教學

首先了解 main() 中的部分程式碼,並理解 function designator

static void (**tasks)(void *);

void (*registered_task[])(void *) = {task0, task1};
tasks = registered_task;
  • void (*registered_task[])(void *) = {task0, task1}
    • registered_task 是 an array of pointers to function,用來初始化的 task0、task1 則是 function designator
    • 根據 C99 規格書

      A function designator is an expression that has function type. Except when it is the operand of the sizeof operator or the unary & operator, a function designator with type ‘‘function returning type’’ is converted to an expression that has type ‘‘pointer to function returning type’’. (C99 6.3.2.4)

      The unary * operator denotes indirection. If the operand points to a function, the result is a function designator. (C99 6.5.3.2.4)

    • 因此,以下程式碼不管加幾個 * 結果都一樣,就跟 $e^x$ 一樣,不管怎麼對 $x$ 微分都是 $e^x$
    void (*registered_task[])(void *) = {****task0, ***task1};
    
  • static void (**tasks)(void *) = registered_task
    • tasks 是 a pointer to pointer to function
    • 根據 C99 規格書

      Except when it is the operand of the sizeof operator or the unary & operator, or is a string literal used to initialize an array, an expression that has type ‘‘array of type’’ is converted to an expression with type ‘‘pointer to type’’ that points to the initial element of the array object and is not an lvalue. (C99 6.3.2.1)

    • 除了作為 sizeof 或 & 的運算元之外,array of type (在這裡 type 是 pointer to function) 都會被轉換成 pointer to type (在這裡就是 pointer to pointer to function),並指向 array 的第一個元素

在把 task0、task1 指派給 registered_task 後進行 schedule()

/* Start each registered task once, then resume queued tasks until none are left. */
void schedule(void)
{
    static int i; /* next task index; static, so it keeps its value across longjmp(sched, 1) */

    srand(0xCAFEBABE ^ (uintptr_t) &schedule); /* Thanks to ASLR */

    setjmp(sched); /* re-entry point for longjmp(sched, 1) */

    while (ntasks-- > 0) {
        int n = rand() % 5; /* 0..4 yields for the task about to start */
        tasks[i++](&n);
        printf("Never reached\n"); /* tasks leave via longjmp() instead of returning */
    }

    task_join(&tasklist);
}
  • srand(0xCAFEBABE ^ (uintptr_t) &schedule)
    Address space layout randomization 簡寫為 ASLR,因為 ASLR 機制每次執行程式時 schedule 地址皆不同,因此可用於亂數種子設定
  • setjmp(sched)
    在程式中標示一個目標位置(跳躍的目的地),jmp_buf sched 儲存程式跳躍時所需之資訊

    The setjmp() function saves various information about the calling environment (typically, the stack pointer, the instruction pointer, possibly the values of other registers and the signal mask) in the buffer env for later use by longjmp()

  • while (ntasks-- > 0)
    透過 int n = rand() % 5; tasks[i++](&n); 執行 tasks 內的 task0, task1,迴圈一開始 ntasks = 2,執行 task0, task1 後 ntasks = 0,並在之後的 longjmp(sched) 回到 setjmp(sched) 地方,接著執行 task_join
/* Task 0 with the quiz blank filled in: queue the context, return to the scheduler, then yield n times. */
void task0(void *arg)
{
    jmp_buf env;
    static int n; /* static: the value is not kept in this function's stack frame */
    n = *(int *) arg;

    printf("Task 0: n = %d\n", n);

    if (setjmp(env) == 0) { /* 0 on the direct call, non-zero when resumed */
        task_add(&tasklist, env);
        longjmp(sched, 1); /* back to schedule() so the next task can start */
    }

    for (int i = 0; i < n; i++) {
        if (setjmp(env) == 0) {
            task_add(&tasklist, env); /* queue this resume point ... */
            task_switch(&tasklist);   /* ... and run whichever task is at the head */
        }
        printf("Task 0: resume\n");
    }

    printf("Task 0: complete\n");
    longjmp(sched, 1); /* hand control back to schedule() */
}
  • setjmp(env) == 0
    • 在直接呼叫 setjmp 函數時其傳回值為 0,若是透過 longjmp 跳回這裡時,其傳回值就會是呼叫 longjmp 時所指定的值 (task_join 中的 longjmp(env, 1) 會使其回傳 1)
    • env 透過 task_add 加入到 list 中
    • longjmp(sched, 1) 跳到 schedule(void) 中的 setjmp(sched)
    • 假設在 task_join 中透過 longjmp(env, 1) 跳到 task0 中的第一個 setjmp(env),因為回傳值為 1,因此開始執行 for loop
  • for loop
    • 再次 setjmp(env),表示之後的 longjmp(env, 1) 會跳來這
    • 因為 task_join 有使用到 list_del,在這裡就要再次 task_add,並執行 task_switch,作用與 task_join 一樣,將此 task 從 list 中移除,接著 longjmp
/*
 * Resume the task at the head of @tasklist; returns only once the list is
 * empty. Every pass through the loop body ends in longjmp().
 */
static void task_join(struct list_head *tasklist)
{
    jmp_buf env;

    while (!list_empty(tasklist)) {
        struct task *t = list_first_entry(tasklist, struct task, list);
        list_del(&t->list);                   /* unlink before the node is freed */
        memcpy(env, t->env, sizeof(jmp_buf)); /* copy out first: t is freed before the jump */
        free(t);
        longjmp(env, 1); /* the resumed setjmp() returns 1 */
    }
}
  • list_first_entry
    找到 list 中第一個 entry,並 delete 掉後,longjmp 到對應的 env
  • longjmp(env, 1)
    跳到 env 對應的 task,並回傳 1
/* Resume the task at the head of @tasklist, or return at once when the list is empty. */
static void task_switch(struct list_head *tasklist)
{
    jmp_buf env;

    if (!list_empty(tasklist)) {
        struct task *t = list_first_entry(tasklist, struct task, list);
        list_del(&t->list);                   /* unlink before the node is freed */
        memcpy(env, t->env, sizeof(jmp_buf)); /* copy out first: t is freed before the jump */
        free(t);
        longjmp(env, 1); /* the resumed setjmp() returns 1 */
    }
}

測驗 5

〈你所不知道的 C 語言:前置處理器應用篇〉已提過若干 Linux 核心的巨集,不難發現這些巨集經歷多次演變。Linux 核心一度有個巨集 ACCESS_ONCE,其作用是確實地讀取所指定記憶體位址的內容值,且限這一次,其原始定義如下:

/* Yield @x as an lvalue accessed through a pointer to volatile-qualified typeof(x). */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) 

注意 volatile 關鍵字的使用
ACCESS_ONCE 巨集的使用情境,可參照 Linux v4.0 的 kernel/locking/mutex.c:

    while (true) {
        struct task_struct *owner;
        ...  
        /*
         * If there's an owner, wait for it to either
         * release the lock or go to sleep.
         */  
        owner = ACCESS_ONCE(lock->owner);  
        if (owner && !mutex_spin_on_owner(lock, owner))
            break;  

然而,如果不使用 ACCESS_ONCE 巨集,程式碼如下:

while (true) {
    struct task_struct *owner;
    ...
    /*
     * If there's an owner, wait for it to either
     * release the lock or go to sleep.
     */
    owner = lock->owner; /* plain load, without ACCESS_ONCE */
    if (owner && !mutex_spin_on_owner(lock, owner))
        break;

由於,編譯器偵測到第 8 行的 owner = lock->owner 在迴圈中沒有被更改,所以其最佳化機制可能將第 8 行的程式碼搬出 while 迴圈之外,如此不用每次都實際地讀取 lock->owner,其程式碼變成:

    struct task_struct *owner = lock->owner;

    while (true) {  
        ...  
        if (owner && !mutex_spin_on_owner(lock, owner))
            break;

但問題來了,lock->owner 有可能被其它核心執行緒修改,從而造成資料不一致。因此使用 ACCESS_ONCE 巨集可以防止編譯器做相關最佳化工作,並確保每次都能到實體記憶體位址讀取。其做法便是將某參數暫時性地轉換成具備 volatile 的型態。如此,存取該參數在非引入 ACCESS_ONCE 巨集之處 (不具 volatile 特性),仍可享用編譯器最佳化的好處。

ACCESS_ONCE() and compiler bugs 則提及 Linux 核心捨棄上述 ACCESS_ONCE 巨集,改為 READ_ONCE 巨集。在 Linux 核心原始程式碼曾存在以下註解:

ACCESS_ONCE will only work on scalar types. For union types, ACCESS_ONCE on a union member will work as long as the size of the member matches the size of the union and the size is smaller than word size.

The major use cases of ACCESS_ONCE used to be

  1. Mediating communication between process-level code and irq/NMI handlers, all running on the same CPU, and
  2. Ensuring that the compiler does not fold, spindle, or otherwise mutilate accesses that either do not require ordering or that interact with an explicit memory barrier or atomic instruction that provides the required ordering.

以下是可能的實作:

#include <stdint.h>
#include <string.h>                       
#include <stdlib.h>

/*
 * Body of __read_once_size(). Expects `p`, `res` and `size` to be in scope.
 * Sizes 1, 2, 4 and 8 are copied with a single load through a
 * volatile-qualified scalar pointer; every other size falls back to memcpy().
 */
#define __READ_ONCE_SIZE                                  \
    ({                                                    \
        switch (size) {                                   \
        case 1:                                           \
            *(uint8_t *) res = *(volatile uint8_t *) p;   \
            break;                                        \
        case 2:                                           \
            *(uint16_t *) res = *(volatile uint16_t *) p; \
            break;                                        \
        case 4:                                           \
            *(uint32_t *) res = *(volatile uint32_t *) p; \
            break;                                        \
        case 8:                                           \
            *(uint64_t *) res = *(volatile uint64_t *) p; \
            break;                                        \
        default:                                          \
            memcpy((void *) res, (const void *) p, size); \
        }                                                 \
    })

/*
 * Copy @size bytes from @p into @res.
 * @p:    object to read
 * @res:  destination buffer of at least @size bytes
 * @size: size of the object in bytes
 */
static inline void __read_once_size(const volatile void *p, void *res, int size)
{
    __READ_ONCE_SIZE;
}

/*
 * Read @x through __read_once_size() and yield the value read.
 *
 * The union overlays a byte array (__c, declared by the quiz blank DECL0)
 * on a typeof(x) object, so the bytes written through __c can be read back
 * as __val.
 *
 * The annotation on the DECL0 line must be a block comment: a `//` comment
 * followed by a backslash-newline continues onto the next line and would
 * swallow the remainder of the macro body.
 */
#define READ_ONCE(x)                                \
    ({                                              \
        union {                                     \
            typeof(x) __val;                        \
            DECL0; /* uint8_t */                    \
        } __u;                                      \
        __read_once_size(&(x), __u.__c, sizeof(x)); \
        __u.__val;                                  \
    })

READ_ONCE 巨集會判斷變數的寬度,針對寬度為 1, 2, 4, 8 位元組的變數,將其轉換為對應的純量 (scalar) 型態並增加 volatile 關鍵字,而對於其他資料寬度的類型,改呼叫 memcpy 來避免編譯器對該變數進行最佳化。

ACCESS_ONCE() and compiler bugs 提到 Christian 說 GCC 4.6 與 4.7 版本當變數不是 scalar type 會捨棄 volatile 修飾字

C99 §6.2.5.21 定義了 Arithmetic types 與 pointer types 是 scalar types,而 Array 與 structure types 是 aggregate types.

Note that aggregate type does not include union type because an object with union type can only contain one member at a time

觀察 __read_once_size(&(x), __u.__c, sizeof(x)),sizeof(x) 的值會是 1, 2, 4, 8, 其他 (>8),透過 *(volatile uint8_t *) p 強制將 p 轉型成 scalar type,避免編譯器出錯

Linus 說到通常不 workaround 編譯器出錯,我們要盡量使程式碼less fragile

__u.__c 與 typeof(x) __val 共用一塊記憶體空間,因為 sizeof(x) 的值會是 1, 2, 4, 8, 其他 (>8),可知最小值是 1,則 DECL0 = uint8_t __c[1]

READ_ONCE / WRITE_ONCE 巨集的實作和其演化

並行程式設計: Atomics 操作 提到 在 Linux 核心使用 volatile 的場景,絕大部分是誤用,搭配 Why the "volatile" type class should not be used 可以整理如下:

  • volatile 不能視為一種簡單的 atomic 變數
  • volatile 會抑制編譯器最佳化,要求編譯器每次使用該變數時,都要從記憶體地址中讀出最新內容,但不能保證 CPU 不會遭遇重排
  • 當程式碼寫得得宜,volatile 只會拖慢速度,可以使用 spinlocks, mutexes, memory barriers 等方法,spinlocks 的舉例如下
spin_lock(&the_lock);
do_something_on(&shared_data);
do_something_else_with(&shared_data);
spin_unlock(&the_lock);

跟著 並行程式設計: Atomics 操作 做實驗,實作 Dekker’s Algorithm

flag1、flag2、turn、counter 定義為 volatile 的 Global 變數

static volatile int flag1 = 0, flag2 = 0, turn = 1;
static volatile int counter = 0;
int loop_cnt;

dekker1 要執行時就舉旗,並將 turn 交給 dekker2,要做的事情就是 counter++

/*
 * Entry protocol, critical section and exit protocol for thread 1:
 * raise flag1, give the turn to thread 2, spin while thread 2 wants to
 * enter and still holds the turn, then increment the shared counter.
 */
static void dekker1(void)
{
    flag1 = 1;
    turn = 2;
    /* Fence left disabled on purpose; the accompanying text reports correct results once it is enabled. */
    // __atomic_thread_fence(__ATOMIC_SEQ_CST);
    while ((flag2 == 1) && (turn == 2))
        ;
    /* critical section */
    counter++;
    /* let the other task run */
    flag1 = 0;
}

/*
 * Entry protocol, critical section and exit protocol for thread 2:
 * the mirror image of dekker1() with the roles of the flags and the
 * turn values swapped.
 */
static void dekker2(void)
{
    flag2 = 1;
    turn = 1;
    /* Fence left disabled on purpose; the accompanying text reports correct results once it is enabled. */
    // __atomic_thread_fence(__ATOMIC_SEQ_CST);
    while ((flag1 == 1) && (turn == 1))
        ;
    /* critical section */
    counter++;
    /* leave critical section */
    flag2 = 0;
}

/* Thread body: announce the start, then run dekker1() loop_cnt times. */
static void *task1(void *arg)
{
    printf("Starting %s\n", __func__);

    int remaining = loop_cnt;
    while (remaining > 0) {
        dekker1();
        remaining--;
    }

    return NULL;
}

/* Thread body: announce the start, then run dekker2() loop_cnt times. */
static void *task2(void *arg)
{
    printf("Starting %s\n", __func__);

    int remaining = loop_cnt;
    while (remaining > 0) {
        dekker2();
        remaining--;
    }

    return NULL;
}

expected_sum = 2 * loop_cnt 主要是我們的預期總和與迴圈次數設定關係

int main(int argc, char **argv)
{
    pthread_t thread1, thread2;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <loopcount>\n", argv[0]);
        exit(1);
    }

    loop_cnt = atoi(argv[1]); /* FIXME: format checks */
    int expected_sum = 2 * loop_cnt;

    (void) pthread_create(&thread1, NULL, task1, NULL);
    (void) pthread_create(&thread2, NULL, task2, NULL);

    void *ret;
    (void) pthread_join(thread1, &ret);
    (void) pthread_join(thread2, &ret);
    printf("Both threads terminated\n");

    /* Check result */
    if (counter != expected_sum) {
        printf("[-] Dekker did not work, sum %d rather than %d.\n", counter,
               expected_sum);
        printf("%d missed updates due to memory consistency races.\n",
               (expected_sum - counter));
        return 1;
    }
    printf("[+] Dekker worked.\n");
    return 0;
}

實驗結果如下

$ gcc -O2 -o dekker dekker.c -lpthread
$ ./dekker 10000000
Starting task1
Starting task2
Both threads terminated
[-] Dekker did not work, sum 19999792 rather than 20000000.
208 missed updates due to memory consistency races.

觀察以下的組合語言做了什麼事

flag1 = 1;
turn = 2;
while ((flag2 == 1) && (turn == 2))
L5:
    movl    $1, flag1(%rip)
    movl    $2, turn(%rip)
    jmp .L4 
.L11:
    movl    turn(%rip), %eax
    cmpl    $2, %eax
    jne .L3 
.L4:
    movl    flag2(%rip), %eax
    cmpl    $1, %eax
    je  .L11

可以看見 volatile 只保證將變數讀寫至主記憶體,而 CPU 在執行過程中重排,使得該 load 操作讀到「舊值」,從而導致混亂,若要輸出正確,改用 __atomic_thread_fence(__ATOMIC_SEQ_CST) 即可,是 memory barrier 的作法,__ATOMIC_SEQ_CST 要求 thread 間 sequentially consistent:不相關的變數也必須滿足 happens-before

觀察使用 __atomic_thread_fence(__ATOMIC_SEQ_CST) 對應的組合語言輸出

flag1 = 1;
turn = 2;
__atomic_thread_fence(__ATOMIC_SEQ_CST);
while ((flag2 == 1) && (turn == 2));
.L16:
    movl    $1, flag1(%rip)
    movl    $2, turn(%rip)
    mfence
    jmp .L15
.L21:
    movl    turn(%rip), %eax
    cmpl    $2, %eax
    jne .L14
.L15:
    movl    flag2(%rip), %eax
    cmpl    $1, %eax
    je  .L21

include/linux/compiler.h

/*
 * Read @x through __read_once_size() and yield the value read.
 * The union overlays a byte array on a typeof(x) object so the helper can
 * fill it through __c while the result is read back as __val.
 * Every line of the body ends in a backslash so that it stays part of the
 * macro definition.
 */
#define READ_ONCE(x)                                            \
({                                                              \
    union { typeof(x) __val; char __c[1]; } __u =               \
            { .__c = { 0 } };                                   \
    /* Note: the size passed is sizeof(x), not sizeof(__u.__c) */ \
    __read_once_size(&(x), __u.__c, sizeof(x));                 \
    __u.__val;                                                  \
})

/*
 * Store @val into @x through __write_once_size() and yield the stored value.
 * The union is initialised from @val via __val and handed to the helper as
 * bytes via __c.
 * Every line of the body ends in a backslash so that it stays part of the
 * macro definition.
 */
#define WRITE_ONCE(x, val)                                      \
({                                                              \
    union { typeof(x) __val; char __c[1]; } __u =               \
            { .__val = (val) };                                 \
    __write_once_size(&(x), __u.__c, sizeof(x));                \
    __u.__val;                                                  \
})

/*
 * Copy @size bytes from @p to @res. Sizes 1, 2, 4 and 8 use one load
 * through a volatile-qualified pointer; any other size uses
 * __builtin_memcpy() bracketed by barrier().
 */
static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
{
    switch (size) {
    case 1:
        *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p;
        break;
    case 2:
        *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p;
        break;
    case 4:
        *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p;
        break;
    case 8:
        *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p;
        break;
    default:
        barrier();
        __builtin_memcpy((void *) res, (const void *) p, size);
        barrier();
        break;
    }
}

/*
 * Copy @size bytes from @res to @p. Sizes 1, 2, 4 and 8 use one store
 * through a volatile-qualified pointer; any other size uses
 * __builtin_memcpy() bracketed by barrier().
 */
static __always_inline void __write_once_size(volatile void *p, void *res, int size)
{
    switch (size) {
    case 1:
        *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res;
        break;
    case 2:
        *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res;
        break;
    case 4:
        *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res;
        break;
    case 8:
        *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res;
        break;
    default:
        barrier();
        __builtin_memcpy((void *) p, (const void *) res, size);
        barrier();
        break;
    }
}