Try   HackMD

2020q1 Homework2 (quiz2)

contributed by < pingsutw >

重新回答第 2 周測驗題

解題思路

xs_new 分別依據 len 大小,來判斷新的字串要存 heap or stack, 當字串大於 16 時,資料存在 heap,所以這邊 AAA 代表 16

xs *xs_new(xs *x, const void *p)
{
    *x = xs_literal_empty();
    size_t len = strlen(p) + 1;
    if (len > AAA) {
        x->capacity = ilog2(len) + 1;
        x->size = len - 1;
        x->is_ptr = true;
        x->ptr = malloc((size_t) 1 << x->capacity);
        memcpy(x->ptr, p, len);
    } else {
        memcpy(x->data, p, len);
        x->space_left = 15 - (len - 1);
    }
    return x;
}

BBB = 32 因為 char 0~255 分別代表不同字元,mask 每單位代表 8 個字元 8*32 = 256, 每個 bit 分別代表 char 不同字元

CCC = & 因為 mask 裡的每個 bit 代表要刪除的字元,跟 mask 裡的每個 bit 做 &,如果是0代表沒有找到符合的字元,反之刪除此字元

xs *xs_trim(xs *x, const char *trimset)
{
    if (!trimset[0])
        return x;

    char *dataptr = xs_data(x), *orig = dataptr;

    /* similar to strspn/strpbrk but it operates on binary data */
    uint8_t mask[BBB] = {0};

#define check_bit(byte) (mask[(uint8_t) byte / 8] CCC 1 << (uint8_t) byte % 8)
#define set_bit(byte) (mask[(uint8_t) byte / 8] |= 1 << (uint8_t) byte % 8)

    size_t i, slen = xs_size(x), trimlen = strlen(trimset);

    for (i = 0; i < trimlen; i++)
        set_bit(trimset[i]);
    for (i = 0; i < slen; i++)
        if (!check_bit(dataptr[i]))
            break;
    for (; slen > 0; slen--)
        if (!check_bit(dataptr[slen - 1]))
            break;
    dataptr += i;
    slen -= i;

    /* reserved space as a buffer on the heap.
     * Do not reallocate immediately. Instead, reuse it as possible.
     * Do not shrink to in place if < 16 bytes.
     */
    memmove(orig, dataptr, slen);
    /* do not dirty memory unless it is needed */
    if (orig[slen])
        orig[slen] = 0;

    if (xs_is_ptr(x))
        x->size = slen;
    else
        x->space_left = 15 - slen;
    return x;
#undef check_bit
#undef set_bit
}

延伸問題

  • 解釋上述程式碼運作原理,並提出改進方案,可善用 GDB 追蹤和分析;
  • 以上述程式碼為基礎,提供字串複製 (應考慮到 CoW) 和類似 strtok 功能的函式實作;
  • 設計實驗,確認上述字串處理最佳化 (針對空間效率) 帶來的效益,應考慮到 locality of reference;
  • 在 Linux 核心原始程式碼中,找出類似手法的 SSO 或 CoW 案例並解說;

xs_copy

字串在 stack 時,複製 string 到 dest,當字串存在 heap 時,有指針指向 src的 data,避免在大字串額外的記憶體複製

  • cow (Copy on write)
    加一個 int pointer 表示 refernece count, refcnt = NULL, 代表沒有其他指標指向同一快記憶體空間 ptr,反之 > 1 代表有 1 個以上同時指向同一個記憶體空間
struct {
    char *ptr;
    /* supports strings up to 2^54 - 1 bytes */
    size_t size : 54,
        /* capacity is always a power of 2 (unsigned)-1 */
        capacity : 6;
    /* the last 4 bits are important flags */
    int *refcnt;
  };
xs *xs_copy(xs *dest, xs *src) {
  if (xs_is_ptr(src)) {
    printf("Data on heap\n");
    dest->is_ptr = true;
    dest->ptr = src->ptr;
    dest->size = xs_size(src);
    if (!src->refcnt) {
      dest->refcnt = src->refcnt = (int *)malloc(sizeof(int));
      *(dest->refcnt) = 1;
    } else {
      dest->refcnt = src->refcnt;
      *(dest->refcnt) += 1;
    }
  } else {
    printf("Data on stack\n");
    memcpy(dest->data, src->data, 16);
    dest->is_ptr = false;
    dest->space_left = 15 - xs_size(src);
  }
  return dest;
}

測試

  1. 先做 concat 讓 string 存在 heap 裡
  2. copy string,此時不會新增一段新的記憶體空間,而是指標指向 string 的位址
  3. 對 copy string 做 trim 此時才會新增新的記憶體空間,並對字串進行修改,可發現記憶體位址跟 string 已經不一樣了
xs copy;
string = *xs_tmp("foobarbar");
prefix = *xs_tmp("(((((");
suffix = *xs_tmp(")))))");

xs_concat(&string, &prefix, &suffix);
xs_copy(&copy, &string);
printf("\nBefore trim\n");
printf("[%s], %2zu\n", xs_data(&string), xs_size(&string));
printf("[%s], %2zu\n", xs_data(&copy), xs_size(&copy));
printf("string %p\ncopy   %p\n", xs_data(&string), xs_data(&copy));
xs_trim(&copy, "\n ");
printf("\nAfter trim\n");
printf("[%s], %2zu\n", xs_data(&copy), xs_size(&copy));
printf("string %p\ncopy   %p\n", xs_data(&string), xs_data(&copy));

輸出結果

Before trim
[(((((foobarbar)))))], 19
[(((((foobarbar)))))], 19
string 0x55555555a6a0
copy   0x55555555a6a0

After trim
[(((((foobarbar)))))], 19
string 0x55555555a6a0
copy   0x55555555a6f0

xs_strtok

  • 將字串分割成一個個片段, x 欲被分解的字串, delim 分隔的字串(符號), 用 strspn 找出分解後的字串的起始點,strpbrk 找出分解後的字串的的終點,並設為 \0 代表字串
  • lastToken 代表下一個分解後的字串的起始點
char *xs_strtok(xs *x, const char *delim) {
  char static *lastToken = NULL;
  char *tmp = NULL;
  char *str = NULL;
    if ( x == NULL ) {
        str = lastToken;
        if (str == NULL)  
            return NULL;
    } else {
        str = xs_data(x);
        str += strspn(str, delim);
    }
 
    /* Find end of segment */
    tmp = strpbrk(str, delim);
    if (tmp) {
        *tmp = '\0';
        lastToken = tmp + 1;
    } else {
        lastToken = NULL;
    }
 
    return str;
}

測試

char delim[8] = "- ", *tmp;
string = *xs_tmp("foo-bar-bar");
printf("\nBefore xs_strtok \n");
printf("[%s], %2zu\n", xs_data(&string), xs_size(&string));
printf("delim = %s\n", delim);

printf("%s\n", xs_strtok(&string, delim));
while(tmp = xs_strtok(NULL, delim))
    printf("%s\n", tmp);

輸出結果

Before xs_strtok 
[foo-bar-bar], 11
delim = - 
foo
bar
bar

程式優化

TODO (Kevin) 重新測量,把 outlier 去除

  • 優化前, 分別測量一百萬次,再取平均
    測試
struct timespec start, stop;
double t1 = 0, t2 = 0;
int times = 1000000;

xs string = *xs_tmp("\n foobarbar \n\n\n");
xs prefix = *xs_tmp("((("), suffix = *xs_tmp(")))");
for(int i = 0; i < times; i++){
    clock_gettime(CLOCK_MONOTONIC, &start);
    xs_trim(&string, "\n ");
    clock_gettime(CLOCK_MONOTONIC, &stop);
    t1 += diff_in_ns(start, stop);
}
printf("[%s] : %2zu\n", xs_data(&string), xs_size(&string));

for (int i = 0; i < times; i++){
    clock_gettime(CLOCK_MONOTONIC, &start);
    xs_concat(&string, &prefix, &suffix);
    clock_gettime(CLOCK_MONOTONIC, &stop);
    t2 += diff_in_ns(start, stop);
}
printf("[%s] : %2zu\n", xs_data(&string), xs_size(&string));

printf("trim : %.10lf ns concat : %.10lf ns\n", t1/times, t2/times);

輸出結果

[foobarbar] :  9
[(((((((((((((((((((((((()))] : 21
trim : 67.6565360000 ns concat : 83.8203580000 ns

參考資料

- [size_t strspn(const char *str1, const char *str2)](http://tw.gitbook.net/c_standard_library/c_function_strspn.html) - [char *strpbrk(const char *str1, const char *str2)](http://tw.gitbook.net/c_standard_library/c_function_strpbrk.html)

關於 C 標準函式庫的參考資料,務必以 Linux man page 為主,方可知道語言規範和實作的議題。避免閱讀第 N 手材料

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv

- C 語言:typedef 的用法

改為查閱 C 語言規格書!

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv

tags: sysprog2020