# 2020q3 Homework3 (dict)
contributed by < `Holycung` >
[2020q3 Homework3 (dict) 題目](https://hackmd.io/@sysprog/2020-dict)
## 目錄
[TOC]
## 閱讀程式碼
### `test_common.c`
有兩種模式 CPY 和 REF 根據輸入的參數決定。
```cpp=
if (!strcmp(argv[1], "CPY") || (argc > 2 && !strcmp(argv[2], "CPY"))) {
CPYmask = 0;
REF = DEL;
printf("CPY mechanism\n");
} else
printf("REF mechanism\n");
```
如果是 REF 模式了話,會配置一個 memory pool 到 Top。
```cpp=
if (CPYmask) { // Only allacte pool in REF mechanism
pool = malloc(poolsize);
if (!pool) {
fprintf(stderr, "Failed to allocate memory pool.\n");
return 1;
}
Top = pool;
}
```
再來會去開起檔案 `cities.txt`。
```cpp=
#define IN_FILE "cities.txt"
FILE *fp = fopen(IN_FILE, "r");
```
第 74 行會先初始化 bloom filter 結構。
```cpp=74
bloom_t bloom = bloom_create(TableSize);
```
第 77 行這邊是用 `fgets` 讀取檔案並進行 parse,把逗號跟 newline parse 掉,然後透過 `tst_ins` 加入到 tst,`bloom_add` 加入到 bloom filter。
```cpp=76
char buf[WORDMAX];
while (fgets(buf, WORDMAX, fp)) {
int offset = 0;
for (int i = 0, j = 0; buf[i + offset]; i++) {
Top[i] =
(buf[i + j] == ',' || buf[i + j] == '\n') ? '\0' : buf[i + j];
j += (buf[i + j] == ',');
}
while (*Top) {
if (!tst_ins(&root, Top, REF)) { /* fail to insert */
fprintf(stderr, "error: memory exhausted, tst_insert.\n");
fclose(fp);
return 1;
}
bloom_add(bloom, Top);
idx++;
int len = strlen(Top);
offset += len + 1;
Top += len + 1;
}
Top -= offset & ~CPYmask;
memset(Top, '\0', WORDMAX);
}
```
### `tst.c`
ternary search tree node.
```cpp=
typedef struct tst_node {
char key; /* char key for node (null for node with string) */
unsigned refcnt; /* refcnt tracks occurrence of word (for delete) */
struct tst_node *lokid; /* ternary low child pointer */
struct tst_node *eqkid; /* ternary equal child pointer */
struct tst_node *hikid; /* ternary high child pointer */
} tst_node;
```
#### `tst_ins`
把 `s` insert 到 tst 當中。
第 221 行可以看到,如果是 cpy mode 每次都會用 `strdup` 複製字串到新的空間 `eqdata` 存起來。如果是 ref mode 則是把 `s` 的地址直接存起來。
```cpp=183
void *tst_ins(tst_node **root, const char *s, const int cpy)
{
const char *p = s;
tst_node *curr, **pcurr;
if (!root || !s)
return NULL; /* validate parameters */
if (strlen(s) + 1 > STKMAX / 2) /* limit length to 1/2 STKMAX */
return NULL; /* 128 char word length is plenty */
pcurr = root;
while ((curr = *pcurr)) {
if (*p == 0 && curr->key == 0) {
curr->refcnt++;
return (void *) curr->eqkid;
}
pcurr = next_node(pcurr, &p);
}
/* if not duplicate, insert remaining chars into tree rooted at curr */
for (;;) {
/* allocate memory for node, and fill. use calloc (or include
* string.h and initialize w/memset) to avoid valgrind warning
* "Conditional jump or move depends on uninitialised value(s)"
*/
if (!(*pcurr = calloc(1, sizeof **pcurr))) {
fprintf(stderr, "error: tst_insert(), memory exhausted.\n");
return NULL;
}
curr = *pcurr;
curr->key = *p;
curr->refcnt = 1;
curr->lokid = curr->hikid = curr->eqkid = NULL;
/* Place nodes until end of the string, at end of stign allocate
* space for data, copy data as final eqkid, and return.
*/
if (*p++ == 0) {
if (cpy) { /* allocate storage for 's' */
const char *eqdata = strdup(s);
if (!eqdata)
return NULL;
curr->eqkid = (tst_node *) eqdata;
return (void *) eqdata;
} else { /* save pointer to 's' (allocated elsewhere) */
curr->eqkid = (tst_node *) s;
return (void *) s;
}
}
pcurr = &(curr->eqkid);
}
}
```
### `bloom.c`
#### `bloom_create`
用來初始化大小為 `TableSize` 的 bloom_filter。
```cpp=
#define TableSize 5000000 /* size of bloom filter */
struct bloom_filter {
struct bloom_hash *func;
void *bits;
size_t size;
};
```
這邊選用 `calloc` 因為要確保 memory content 都被設為 0,然後可以看到這邊是從 byte-level 改成 bit-level 節省空間。最後再用 `bloom_add_hash` 把兩個 hash function `djb2`、`jenkins` 加入結構當中。
```cpp=
bloom_t bloom_create(size_t size)
{
bloom_t res = calloc(1, sizeof(struct bloom_filter));
res->size = size;
res->bits = calloc((size + 7) >> 3, 1);
bloom_add_hash(res, djb2);
bloom_add_hash(res, jenkins);
return res;
}
```
#### `bloom_add`
將傳進來的 item 進行 hash function,並更新 `bits` 陣列。
```cpp=
void bloom_add(bloom_t filter, const void *item)
{
struct bloom_hash *h = filter->func;
uint8_t *bits = filter->bits;
while (h) {
unsigned int hash = h->func(item);
hash %= filter->size;
bits[hash >> 3] |= 0x80 >> (hash & 7);
h = h->next;
}
}
```
#### `bloom_test`
測試給定的 item 是否存在,如果 hash 過後的位元都是 1 就代表存在。
```cpp=
bool bloom_test(bloom_t filter, const void *item)
{
struct bloom_hash *h = filter->func;
uint8_t *bits = filter->bits;
while (h) {
unsigned int hash = h->func(item);
hash %= filter->size;
if (!(bits[hash >> 3] & (0x80 >> (hash & 7)))) {
return false;
}
h = h->next;
}
return true;
}
```
## 參考資料
- [Bloom Filter概念和原理](https://blog.csdn.net/jiaomeng/article/details/1495500)