contributed by <river85511
>
ZixinYang
Lubuntu 16.04
4.10.0-37-generic
Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 3072K
TST
ann
。Key
用來記錄 Keys 中的其中一個字元 (包括 EndOfString
)。low
,用來指向比當前的 Node 的 Key 小 的 Node。equal
,用來指向與當前的 Node 的 Key 一樣大 的 Node。high
,用來指向比當前的 Node 的 Key 大 的 Node。觀察在 tst.c
中 tst_node
/** ternary search tree node. */
typedef struct tst_node {
char key;
unsigned refcnt;
struct tst_node *lokid;
struct tst_node *eqkid;
struct tst_node *hikid;
} tst_node;
o
|-key
|-refcnt
------------+------------
|lokid |eqkid |hikid
o o o
refcnt
refcnt
是用來記錄一個 Keys 出現的次數。cat
插入 TST 中,會是如此的結構: o
|-c
|-0
------------+------------
|lokid |eqkid |hikid
o o o
|-a
|-0
----+----
| | | note: any of the lokid or hikid nodes
o can also have pointers to nodes
|-t for words that "cat" or "ca" is
|-0 a partial prefix to.
----+----
| | |
o
|-0
|-1 <== the refcnt is only relevant to the final node
----+----
| | |
NULL o NULL
"cat"
cat
中的 t
的 tst_node
之後多了一個 記錄 0
的 Key
的 tst_node
key
為 0
的 tst_node
用來表示 cat
這個字串的 EndOfString
,以及記錄 cat
這個字串的出現的次數 (因此 refcnt
為 1
)。test_cpy.c
中相對應的程式碼case 'f':
res = tst_search(root, word);
if (res)
printf(" found %s in %.6f sec.\n", (char *) res, t2 - t1);
else
printf(" %s not found.\n", word);
break;
test_cpy.c
中相對應的程式碼case 's':
res = tst_search_prefix(root, word, sgl, &sidx, LMAX);
if (res) {
printf(" %s - searched prefix in %.6f sec\n\n", word, t2 - t1);
for (int i = 0; i < sidx; i++)
printf("suggest[%d] : %s\n", i, sgl[i]);
} else
printf(" %s - not found\n", word);
break;
test_cpy.c
中相對應的部份。case 'a':
res = tst_ins_del(&root, &p, INS, CPY);
if (res) {
idx++;
printf(" %s - inserted in %.6f sec. (%d words in tree)\n",
(char *) res, t2 - t1, idx);
} else
printf(" %s - already exists in list.\n", (char *) res);
break;
test_cpy.c
中相對應的程式碼case 'd':
res = tst_ins_del(&root, &p, DEL, CPY);
if (res)
printf(" delete failed.\n");
else {
printf(" deleted %s in %.6f sec\n", word, t2 - t1);
idx--;
}
break;
test_cpy.c
中相對應的程式碼case 'q':
tst_free_all(root);
return 0;
break;
test_cpy.c
可以發現 TST
的操作在 tst.c
中對應的函式為:
tst_search()
、tst_search_prefix()
tst_ins_del()
TST
:tst_free_all()
TODO: 觀察上述的函式程式碼
tst_search()
tst.c
中的程式碼void *tst_search(const tst_node *p, const char *s)
{
const tst_node *curr = p;
while (curr) { /* loop over each char in 's' */
int diff = *s - curr->key; /* calculate the difference */
if (diff == 0) { /* handle the equal case */
if (*s == 0) /* if *s = curr->key = nul-char, 's' found */
return (void *) curr->eqkid; /* return pointer to 's' */
s++;
curr = curr->eqkid;
} else if (diff < 0) /* handle the less than case */
curr = curr->lokid;
else
curr = curr->hikid; /* handle the greater than case */
}
return NULL;
}
while
迴圈的內容解讀
tst_node
類型的指標 curr
指向的 tst_node
中的 key
會和 待搜索字串中的字元 *s
作比較
*s
> key
則將 curr
指向 curr->hikid
所指向的記憶體位置。*s
< key
則將 curr
指向 curr->lokid
所指向的記憶體位置。*s
= key
*s
為字串中 EOS (EndOfString)
,則表示找到待搜索的字串了,因此回傳 curr->eqkid
的位置。(因為 curr->eqkid
中紀錄著待搜索的字串,如上面提及的 cat
之例子。)*s
不是 EOS
,則繼續在 curr->eqkid
中搜尋字串中的下一個字元(s++
)。curr
為 NULL
為止。tst_search_prefix()
tst.c
中的程式碼void *tst_search_prefix(const tst_node *root,
const char *s,
char **a,
int *n,
const int max)
{
const tst_node *curr = root;
const char *start = s;
if (!*s)
return NULL;
/* get length of s */
for (; *start; start++)
; /* wait */
const size_t nchr = start - s;
start = s; /* reset start to s */
*n = 0; /* initialize n - 0 */
/* Loop while we haven't hit a NULL node or returned */
while (curr) {
int diff = *s - curr->key; /* calculate the difference */
if (diff == 0) { /* handle the equal case */
/* check if prefix number of chars reached */
if ((size_t)(s - start) == nchr - 1) {
/* call tst_suggest to fill a with pointer to matching words */
tst_suggest(curr, curr->key, nchr, a, n, max);
return (void *) curr;
}
if (*s == 0) /* no matching prefix found in tree */
return (void *) curr->eqkid;
s++;
curr = curr->eqkid;
} else if (diff < 0) /* handle the less than case */
curr = curr->lokid;
else
curr = curr->hikid; /* handle the greater than case */
}
return NULL;
}
while
迴圈的部份1. 紀錄字串長度的部份
/* get length of s */
for (; *start; start++)
; /* wait */
const size_t nchr = start - s;
start = s; /* reset start to s */
*n = 0; /* initialize n - 0 */
解讀:
for
迴圈將 start
指向字串的最後一個字元 (EOS
)size_t
型態的變數 nchr
來紀錄最後一個字元到第一個字元(也就是字串的全長)再來是 2. while 迴圈的部份
while (curr) {
int diff = *s - curr->key; /* calculate the difference */
if (diff == 0) { /* handle the equal case */
/* check if prefix number of chars reached */
if ((size_t)(s - start) == nchr - 1) {
/* call tst_suggest to fill a with pointer to matching words */
tst_suggest(curr, curr->key, nchr, a, n, max);
return (void *) curr;
}
if (*s == 0) /* no matching prefix found in tree */
return (void *) curr->eqkid;
s++;
curr = curr->eqkid;
} else if (diff < 0) /* handle the less than case */
curr = curr->lokid;
else
curr = curr->hikid; /* handle the greater than case */
}
while
迴圈和 tst_search()
中的 while
迴圈最大的差別在於:當搜尋到 待搜尋字串的EOS
的前一個字元時,就表示找到相符合的字首了,接著會呼叫 tst_suggest()
來將符合字首要求的字串全部儲存在一個 char
型態的雙指標(double pointer
) a
中。tst_ins_del()
tst.c
中的程式碼void *tst_ins_del(tst_node **root, char *const *s, const int del, const int cpy)
{
int diff;
const char *p = *s;
tst_stack stk = {.data = {NULL}, .idx = 0};
tst_node *curr, **pcurr;
if (!root || !*s)
return NULL; /* validate parameters */
if (strlen(*s) + 1 > STKMAX / 2) /* limit length to 1/2 STKMAX */
return NULL; /* 128 char word lenght is plenty */
pcurr = root; /* start at root */
while ((curr = *pcurr)) { /* iterate to insertion node */
diff = *p - curr->key; /* get ASCII diff for >, <, = */
if (diff == 0) { /* if char equal to node->key */
if (*p++ == 0) { /* check if word is duplicate */
if (del) { /* delete instead of insert */
(curr->refcnt)--; /* decrement reference count */
/* chk refcnt, del 's', return NULL on successful del */
return tst_del_word(root, curr, &stk, 1);
} else
curr->refcnt++; /* increment refcnt if word exists */
return (void *) curr->eqkid; /* pointer to word / NULL on del */
}
pcurr = &(curr->eqkid); /* get next eqkid pointer address */
} else if (diff < 0) { /* if char less than node->key */
pcurr = &(curr->lokid); /* get next lokid pointer address */
} else { /* if char greater than node->key */
pcurr = &(curr->hikid); /* get next hikid pointer address */
}
if (del)
tst_stack_push(&stk, curr); /* push node on stack for del */
}
/* if not duplicate, insert remaining chars into tree rooted at curr */
for (;;) {
/* allocate memory for node, and fill. use calloc (or include
* string.h and initialize w/memset) to avoid valgrind warning
* "Conditional jump or move depends on uninitialised value(s)"
*/
if (!(*pcurr = calloc(1, sizeof **pcurr))) {
fprintf(stderr, "error: tst_insert(), memory exhausted.\n");
return NULL;
}
curr = *pcurr;
curr->key = *p;
curr->refcnt = 1;
curr->lokid = curr->hikid = curr->eqkid = NULL;
if (!*root) /* handle assignment to root if no root */
*root = *pcurr;
/* Place nodes until end of the string, at end of stign allocate
* space for data, copy data as final eqkid, and return.
*/
if (*p++ == 0) {
if (cpy) { /* allocate storage for 's' */
const char *eqdata = strdup(*s);
if (!eqdata)
return NULL;
curr->eqkid = (tst_node *) eqdata;
return (void *) eqdata;
} else { /* save pointer to 's' (allocated elsewhere) */
curr->eqkid = (tst_node *) *s;
return (void *) *s;
}
}
pcurr = &(curr->eqkid);
}
}
while
迴圈for
迴圈1. 檢查字串是否符合規範
if (!root || !*s)
return NULL; /* validate parameters */
if (strlen(*s) + 1 > STKMAX / 2) /* limit length to 1/2 STKMAX */
return NULL; /* 128 char word lenght is plenty */
解讀
STKMAX
在 tst.c
中的定義為 WRDMAX
的兩倍,因此若字串的長度加上 EOS
後比 WRDMAX
大的話,該字串就無法被插入到 TST
中。(註:在 tst.c
中,WRDMAX
被定義為 128
)0
的話,則不插入或刪除。接著是 2. while 迴圈
pcurr = root; /* start at root */
while ((curr = *pcurr)) { /* iterate to insertion node */
diff = *p - curr->key; /* get ASCII diff for >, <, = */
if (diff == 0) { /* if char equal to node->key */
if (*p++ == 0) { /* check if word is duplicate */
if (del) { /* delete instead of insert */
(curr->refcnt)--; /* decrement reference count */
/* chk refcnt, del 's', return NULL on successful del */
return tst_del_word(root, curr, &stk, 1);
} else
curr->refcnt++; /* increment refcnt if word exists */
return (void *) curr->eqkid; /* pointer to word / NULL on del */
}
pcurr = &(curr->eqkid); /* get next eqkid pointer address */
} else if (diff < 0) { /* if char less than node->key */
pcurr = &(curr->lokid); /* get next lokid pointer address */
} else { /* if char greater than node->key */
pcurr = &(curr->hikid); /* get next hikid pointer address */
}
if (del)
tst_stack_push(&stk, curr); /* push node on stack for del */
}
del == 0
del == 1
curr
指向的 tst_node
的 key
和 字串中的字元 *p
的大小
*p
> key
,則讓 pcurr
指向 指向curr->lokid
的指標的記憶體位置。*p
< key
,則讓 pcurr
指向 指向curr->hikid
的指標的記憶體位置。*p
= key
*s++
)為 EOS
時,將curr
指向的 tst_node
的 refcnt
值加一。EOS
,則將 pcurr
指向 指向curr->eqkid
的指標的記憶體位置。EOS
之前 的字元的 Node
的記憶體位置儲存在 stk
中。s++
)為 EOS
時,將 curr
指向的 tst_node
的 refcnt
的值減一,並將 stk
中所儲存的記憶體的內容,透過呼叫 tst_del_word()
來釋放。為什麼在新增或刪除字串時,不是將
curr->eqkid
的refcnt
的值加一或減一?David Kuo
3. for 迴圈
/* if not duplicate, insert remaining chars into tree rooted at curr */
for (;;) {
/* allocate memory for node, and fill. use calloc (or include
* string.h and initialize w/memset) to avoid valgrind warning
* "Conditional jump or move depends on uninitialised value(s)"
*/
if (!(*pcurr = calloc(1, sizeof **pcurr))) {
fprintf(stderr, "error: tst_insert(), memory exhausted.\n");
return NULL;
}
curr = *pcurr;
curr->key = *p;
curr->refcnt = 1;
curr->lokid = curr->hikid = curr->eqkid = NULL;
if (!*root) /* handle assignment to root if no root */
*root = *pcurr;
/* Place nodes until end of the string, at end of stign allocate
* space for data, copy data as final eqkid, and return.
*/
if (*p++ == 0) {
if (cpy) { /* allocate storage for 's' */
const char *eqdata = strdup(*s);
if (!eqdata)
return NULL;
curr->eqkid = (tst_node *) eqdata;
return (void *) eqdata;
} else { /* save pointer to 's' (allocated elsewhere) */
curr->eqkid = (tst_node *) *s;
return (void *) *s;
}
}
pcurr = &(curr->eqkid);
}
TODO: 觀察字典檔
Cities.tst
93827
筆資料。Shanghai, China
A-Z
的大小。TODO: 根據觀察的結果,預期可能會有的問題後,透過測試程式
test_cpy
來確認自己的想法,最後修正程式中的 bug。
$ ./test_cpy
ternary_tree, loaded 259112 words in 0.149612 sec
Commands:
a add word to the tree
f find word in tree
s search words matching prefix
d delete word from the tree
q quit, freeing all data
tst.h
和 tst.c
為何要 Forward Declarationtst.c
裡的函式的功能時,就可以在 tst.h
中很方便的閱讀當初的開發者對於 tst.c
裡各個函式的註解。