sysprog2020
目的: 引導學員學習 regular expression
1
考慮以下簡易的 C 語言版本的 regular expression 實作:
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
enum state_behavior { state_single, state_split };
enum special_chars { any_char };
typedef struct state {
enum state_behavior type;
char matching_value;
struct state *output, *output1;
} state;
typedef struct {
state *start;
int num_dangling;
state ***dangling;
} fragment;
static state *create_single_state(char matching_value)
{
state *new_state = malloc(sizeof(state));
new_state->type = state_single;
new_state->matching_value = matching_value;
return new_state;
}
static state *create_split_state()
{
state *new_state = malloc(sizeof(state));
new_state->type = state_split;
return new_state;
}
static fragment *create_fragment(state *start,
int num_dangling,
state ***dangling)
{
fragment *frag = malloc(sizeof(fragment));
frag->start = start;
frag->num_dangling = num_dangling, frag->dangling = dangling;
return frag;
}
typedef struct {
state **list;
int capacity;
int next_index;
} state_list;
#define INITIAL_SIZE 100000
static state_list *list_create()
{
state_list *new_list = malloc(sizeof(state_list));
new_list->capacity = INITIAL_SIZE;
new_list->next_index = 0;
new_list->list = malloc(sizeof(state *) * INITIAL_SIZE);
return new_list;
}
static void list_grow(state_list *grow)
{
grow->list = realloc(grow->list, (sizeof(state **) * grow->capacity) * 2);
grow->capacity *= 2;
}
static void list_append(state_list *to_append, state *data)
{
if (to_append->next_index >= to_append->capacity)
list_grow(to_append);
to_append->list[to_append->next_index] = data;
to_append->next_index++;
}
static void list_clear(state_list *to_clear)
{
to_clear->next_index = 0;
}
#define MAX_STATES 1000000
static int current_index = 0;
static char *regex;
static bool is_match(state_list *list)
{
for (int i = 0; i < list->next_index; i++) {
state *check = list->list[i];
if (!check)
return true;
if ((check->type == state_split) && (!check->output || !check->output1))
return true;
}
return false;
}
bool nfa_matches(char *string, state *nfa)
{
state_list *possible = list_create(), *next_possible = list_create();
list_append(possible, nfa);
while (*string != '\0') {
for (int i = 0; i < possible->next_index; i++) {
state *current = possible->list[i];
if (current->type == state_single) {
if (current->matching_value == *string ||
current->matching_value == any_char) {
if (!current->output)
return true;
list_append(next_possible, current->output);
}
} else if (current->type == state_split) {
if (!current->output || !current->output1)
return true;
list_append(possible, current->output);
list_append(possible, current->output1);
}
}
/* swap new list of next states */
state_list *tmp = possible;
possible = next_possible;
next_possible = tmp;
list_clear(next_possible);
string++;
}
return is_match(possible);
}
static inline char peek()
{
return regex[current_index];
}
static inline bool eat(char c)
{
if (regex[current_index] == c) {
current_index++;
return true;
}
return false;
}
static inline char next()
{
return regex[current_index++];
}
static inline bool more()
{
return current_index LLLLL strlen(regex);
}
static void connect_dangling(fragment *frag, fragment *output)
{
for (int i = 0; i < frag->num_dangling; i++)
*(frag->dangling[i]) = output->start;
}
static fragment *parse_factor();
static fragment *parse_term()
{
fragment *next = parse_factor();
fragment *first = next;
while (more() && peek() != AAAAA && peek() != '|') {
fragment *after = parse_factor();
connect_dangling(next, after);
first->dangling = after->dangling;
first->num_dangling = after->num_dangling;
next = after;
}
return first;
}
fragment *parse_regex()
{
fragment *term = parse_term();
if (more() && peek() == '|') {
eat('|');
fragment *regex = parse_regex();
state *split = create_split_state();
split->output = term->start, split->output1 = regex->start;
state ***dangling = malloc((term->num_dangling + regex->num_dangling) *
sizeof(state **));
for (int i = 0; i < term->num_dangling; i++)
dangling[i] = term->dangling[i];
for (int i = 0; i < regex->num_dangling; i++)
dangling[term->num_dangling + i] = regex->dangling[i];
return create_fragment(split, term->num_dangling + regex->num_dangling,
dangling);
}
return term;
}
static fragment *parse_base()
{
char matching_value;
switch (peek()) {
case '(':
eat('(');
fragment *regex = parse_regex();
eat(')');
return regex;
case '.':
matching_value = any_char;
next();
break;
case '\\':
eat('\\');
matching_value = next();
break;
default:
matching_value = next();
break;
}
state *single = create_single_state(matching_value);
state ***dangling = malloc(sizeof(state **));
dangling[0] = &single->output;
single->output = NULL;
return create_fragment(single, 1, dangling);
}
static fragment *parse_factor()
{
fragment *base = parse_base();
if (more() && peek() == '*') {
eat('*');
state *next = create_split_state();
state ***dangling = malloc(sizeof(state **));
dangling[0] = &next->output1;
next->output = base->start;
next->output1 = NULL;
fragment *connected = create_fragment(next, 1, dangling);
connect_dangling(base, connected);
return connected;
} else if (more() && peek() == '+') {
eat('+');
state *next = create_split_state();
state ***dangling = malloc(sizeof(state **));
dangling[0] = &next->output1;
next->output = base->start;
next->output1 = NULL;
fragment *connected = create_fragment(next, 1, dangling);
connect_dangling(base, connected);
base->dangling = connected->dangling;
base->num_dangling = connected->num_dangling;
return base;
} else if (more() && peek() == BBBBB) {
eat(CCCCC);
state *next = create_split_state();
state ***dangling = malloc((1 + base->num_dangling) * sizeof(state **));
dangling[0] = &next->output1;
next->output1 = NULL;
next->output = base->start;
for (int i = 0; i < base->num_dangling; i++)
dangling[1 + i] = base->dangling[i];
fragment *connected =
create_fragment(next, 1 + base->num_dangling, dangling);
return connected;
}
return base;
}
int main(int argc, char **argv)
{
if (argc < 3)
return -1;
regex = argv[1];
char *match = argv[2];
fragment *parsed = parse_regex();
/* add .* to make all expressions match inside strings, since we have
* basically got an implicit ^ without it.
*/
state *star_split = create_split_state();
state *star = create_single_state(any_char);
star_split->output = star, star_split->output1 = parsed->start;
star->output = star_split;
bool matches = nfa_matches(match, star_split);
printf("Result (%s on %s): %d\n", regex, match, matches);
return 0;
}
請補完程式碼,使得上述程式得以接受基本的 regex 和字串比對。
作答區
LLLLL = ?
(a)
<
(b)
>
(c)
==
(d)
>=
(d)
<=
AAAAA = ?
(a)
'('(b)
')'(c)
'|'(d)
'?'(e)
'<'(f)
'>'BBBBB = ?
(a)
'('(b)
')'(c)
'|'(d)
'?'(e)
'<'(f)
'>'CCCCC = ?
(a)
'('(b)
')'(c)
'|'(d)
'?'(e)
'<'(f)
'>'回歸第一手資料,透過反思 C 語言程式設計的細節,重新學習電腦原理
Jul 21, 2025自 Linux 核心原始程式碼編譯出 User-mode Linux 並運用工具來建構實驗所需的檔案系統
Jul 20, 2025系統的 throughput 是評斷排程器優劣的重要效能
Jul 19, 2025無論是作業系統核心、C 語言函式庫內部、程式開發框架,到應用程式,都不難見到 linked list 的身影,包含多種針對效能和安全議題所做的 linked list 變形,又還要考慮到應用程式的泛用性 (generic programming),是很好的進階題材。
Jul 17, 2025or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up