2020q3 Homework3 (dict)

# 2020q3 Homework3 (dict) contributed by < `Holycung` > [2020q3 Homework3 (dict) 題目](https://hackmd.io/@sysprog/2020-dict) ## 目錄 [TOC] ## 閱讀程式碼 ### `test_common.c` 有兩種模式 CPY 和 REF 根據輸入的參數決定。 ```cpp= if (!strcmp(argv[1], "CPY") || (argc > 2 && !strcmp(argv[2], "CPY"))) { CPYmask = 0; REF = DEL; printf("CPY mechanism\n"); } else printf("REF mechanism\n"); ``` 如果是 REF 模式了話，會配置一個 memory pool 到 Top。 ```cpp= if (CPYmask) { // Only allacte pool in REF mechanism pool = malloc(poolsize); if (!pool) { fprintf(stderr, "Failed to allocate memory pool.\n"); return 1; } Top = pool; } ``` 再來會去開起檔案 `cities.txt`。 ```cpp= #define IN_FILE "cities.txt" FILE *fp = fopen(IN_FILE, "r"); ``` 第 74 行會先初始化 bloom filter 結構。 ```cpp=74 bloom_t bloom = bloom_create(TableSize); ``` 第 77 行這邊是用 `fgets` 讀取檔案並進行 parse，把逗號跟 newline parse 掉，然後透過 `tst_ins` 加入到 tst，`bloom_add` 加入到 bloom filter。 ```cpp=76 char buf[WORDMAX]; while (fgets(buf, WORDMAX, fp)) { int offset = 0; for (int i = 0, j = 0; buf[i + offset]; i++) { Top[i] = (buf[i + j] == ',' || buf[i + j] == '\n') ? '\0' : buf[i + j]; j += (buf[i + j] == ','); } while (*Top) { if (!tst_ins(&root, Top, REF)) { /* fail to insert */ fprintf(stderr, "error: memory exhausted, tst_insert.\n"); fclose(fp); return 1; } bloom_add(bloom, Top); idx++; int len = strlen(Top); offset += len + 1; Top += len + 1; } Top -= offset & ~CPYmask; memset(Top, '\0', WORDMAX); } ``` ### `tst.c` ternary search tree node. ```cpp= typedef struct tst_node { char key; /* char key for node (null for node with string) */ unsigned refcnt; /* refcnt tracks occurrence of word (for delete) */ struct tst_node *lokid; /* ternary low child pointer */ struct tst_node *eqkid; /* ternary equal child pointer */ struct tst_node *hikid; /* ternary high child pointer */ } tst_node; ``` #### `tst_ins` 把 `s` insert 到 tst 當中。第 221 行可以看到，如果是 cpy mode 每次都會用 `strdup` 複製字串到新的空間 `eqdata` 存起來。如果是 ref mode 則是把 `s` 的地址直接存起來。 ```cpp=183 void *tst_ins(tst_node **root, const char *s, const int cpy) { const char *p = s; tst_node *curr, **pcurr; if (!root || !s) return NULL; /* validate parameters */ if (strlen(s) + 1 > STKMAX / 2) /* limit length to 1/2 STKMAX */ return NULL; /* 128 char word length is plenty */ pcurr = root; while ((curr = *pcurr)) { if (*p == 0 && curr->key == 0) { curr->refcnt++; return (void *) curr->eqkid; } pcurr = next_node(pcurr, &p); } /* if not duplicate, insert remaining chars into tree rooted at curr */ for (;;) { /* allocate memory for node, and fill. use calloc (or include * string.h and initialize w/memset) to avoid valgrind warning * "Conditional jump or move depends on uninitialised value(s)" */ if (!(*pcurr = calloc(1, sizeof **pcurr))) { fprintf(stderr, "error: tst_insert(), memory exhausted.\n"); return NULL; } curr = *pcurr; curr->key = *p; curr->refcnt = 1; curr->lokid = curr->hikid = curr->eqkid = NULL; /* Place nodes until end of the string, at end of stign allocate * space for data, copy data as final eqkid, and return. */ if (*p++ == 0) { if (cpy) { /* allocate storage for 's' */ const char *eqdata = strdup(s); if (!eqdata) return NULL; curr->eqkid = (tst_node *) eqdata; return (void *) eqdata; } else { /* save pointer to 's' (allocated elsewhere) */ curr->eqkid = (tst_node *) s; return (void *) s; } } pcurr = &(curr->eqkid); } } ``` ### `bloom.c` #### `bloom_create` 用來初始化大小為 `TableSize` 的 bloom_filter。 ```cpp= #define TableSize 5000000 /* size of bloom filter */ struct bloom_filter { struct bloom_hash *func; void *bits; size_t size; }; ``` 這邊選用 `calloc` 因為要確保 memory content 都被設為 0，然後可以看到這邊是從 byte-level 改成 bit-level 節省空間。最後再用 `bloom_add_hash` 把兩個 hash function `djb2`、`jenkins` 加入結構當中。 ```cpp= bloom_t bloom_create(size_t size) { bloom_t res = calloc(1, sizeof(struct bloom_filter)); res->size = size; res->bits = calloc((size + 7) >> 3, 1); bloom_add_hash(res, djb2); bloom_add_hash(res, jenkins); return res; } ``` #### `bloom_add` 將傳進來的 item 進行 hash function，並更新 `bits` 陣列。 ```cpp= void bloom_add(bloom_t filter, const void *item) { struct bloom_hash *h = filter->func; uint8_t *bits = filter->bits; while (h) { unsigned int hash = h->func(item); hash %= filter->size; bits[hash >> 3] |= 0x80 >> (hash & 7); h = h->next; } } ``` #### `bloom_test` 測試給定的 item 是否存在，如果 hash 過後的位元都是 1 就代表存在。 ```cpp= bool bloom_test(bloom_t filter, const void *item) { struct bloom_hash *h = filter->func; uint8_t *bits = filter->bits; while (h) { unsigned int hash = h->func(item); hash %= filter->size; if (!(bits[hash >> 3] & (0x80 >> (hash & 7)))) { return false; } h = h->next; } return true; } ``` ## 參考資料 - [Bloom Filter概念和原理](https://blog.csdn.net/jiaomeng/article/details/1495500)