# [2020q3](http://wiki.csie.ncku.edu.tw/sysprog/schedule) 第 17 週測驗題 ###### tags: `sysprog2020` :::info 目的: 引導學員學習 regular expression ::: ==[作答表單](https://docs.google.com/forms/d/e/1FAIpQLScxYAWqxmUxhwoinNNUvJ8RfaJkbFwaVyVYZVTKvmRDCdTwjg/viewform)== --- ### 測驗 `1` 考慮以下簡易的 C 語言版本的 regular expression 實作: ```cpp #include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> enum state_behavior { state_single, state_split }; enum special_chars { any_char }; typedef struct state { enum state_behavior type; char matching_value; struct state *output, *output1; } state; typedef struct { state *start; int num_dangling; state ***dangling; } fragment; static state *create_single_state(char matching_value) { state *new_state = malloc(sizeof(state)); new_state->type = state_single; new_state->matching_value = matching_value; return new_state; } static state *create_split_state() { state *new_state = malloc(sizeof(state)); new_state->type = state_split; return new_state; } static fragment *create_fragment(state *start, int num_dangling, state ***dangling) { fragment *frag = malloc(sizeof(fragment)); frag->start = start; frag->num_dangling = num_dangling, frag->dangling = dangling; return frag; } typedef struct { state **list; int capacity; int next_index; } state_list; #define INITIAL_SIZE 100000 static state_list *list_create() { state_list *new_list = malloc(sizeof(state_list)); new_list->capacity = INITIAL_SIZE; new_list->next_index = 0; new_list->list = malloc(sizeof(state *) * INITIAL_SIZE); return new_list; } static void list_grow(state_list *grow) { grow->list = realloc(grow->list, (sizeof(state **) * grow->capacity) * 2); grow->capacity *= 2; } static void list_append(state_list *to_append, state *data) { if (to_append->next_index >= to_append->capacity) list_grow(to_append); to_append->list[to_append->next_index] = data; to_append->next_index++; } static void list_clear(state_list *to_clear) { to_clear->next_index = 0; } #define MAX_STATES 1000000 static int current_index = 0; static char *regex; static bool is_match(state_list *list) { for (int i = 0; i < list->next_index; i++) { state *check = list->list[i]; if (!check) return true; if ((check->type == state_split) && (!check->output || !check->output1)) return true; } return false; } bool nfa_matches(char *string, state *nfa) { state_list *possible = list_create(), *next_possible = list_create(); list_append(possible, nfa); while (*string != '\0') { for (int i = 0; i < possible->next_index; i++) { state *current = possible->list[i]; if (current->type == state_single) { if (current->matching_value == *string || current->matching_value == any_char) { if (!current->output) return true; list_append(next_possible, current->output); } } else if (current->type == state_split) { if (!current->output || !current->output1) return true; list_append(possible, current->output); list_append(possible, current->output1); } } /* swap new list of next states */ state_list *tmp = possible; possible = next_possible; next_possible = tmp; list_clear(next_possible); string++; } return is_match(possible); } static inline char peek() { return regex[current_index]; } static inline bool eat(char c) { if (regex[current_index] == c) { current_index++; return true; } return false; } static inline char next() { return regex[current_index++]; } static inline bool more() { return current_index LLLLL strlen(regex); } static void connect_dangling(fragment *frag, fragment *output) { for (int i = 0; i < frag->num_dangling; i++) *(frag->dangling[i]) = output->start; } static fragment *parse_factor(); static fragment *parse_term() { fragment *next = parse_factor(); fragment *first = next; while (more() && peek() != AAAAA && peek() != '|') { fragment *after = parse_factor(); connect_dangling(next, after); first->dangling = after->dangling; first->num_dangling = after->num_dangling; next = after; } return first; } fragment *parse_regex() { fragment *term = parse_term(); if (more() && peek() == '|') { eat('|'); fragment *regex = parse_regex(); state *split = create_split_state(); split->output = term->start, split->output1 = regex->start; state ***dangling = malloc((term->num_dangling + regex->num_dangling) * sizeof(state **)); for (int i = 0; i < term->num_dangling; i++) dangling[i] = term->dangling[i]; for (int i = 0; i < regex->num_dangling; i++) dangling[term->num_dangling + i] = regex->dangling[i]; return create_fragment(split, term->num_dangling + regex->num_dangling, dangling); } return term; } static fragment *parse_base() { char matching_value; switch (peek()) { case '(': eat('('); fragment *regex = parse_regex(); eat(')'); return regex; case '.': matching_value = any_char; next(); break; case '\\': eat('\\'); matching_value = next(); break; default: matching_value = next(); break; } state *single = create_single_state(matching_value); state ***dangling = malloc(sizeof(state **)); dangling[0] = &single->output; single->output = NULL; return create_fragment(single, 1, dangling); } static fragment *parse_factor() { fragment *base = parse_base(); if (more() && peek() == '*') { eat('*'); state *next = create_split_state(); state ***dangling = malloc(sizeof(state **)); dangling[0] = &next->output1; next->output = base->start; next->output1 = NULL; fragment *connected = create_fragment(next, 1, dangling); connect_dangling(base, connected); return connected; } else if (more() && peek() == '+') { eat('+'); state *next = create_split_state(); state ***dangling = malloc(sizeof(state **)); dangling[0] = &next->output1; next->output = base->start; next->output1 = NULL; fragment *connected = create_fragment(next, 1, dangling); connect_dangling(base, connected); base->dangling = connected->dangling; base->num_dangling = connected->num_dangling; return base; } else if (more() && peek() == BBBBB) { eat(CCCCC); state *next = create_split_state(); state ***dangling = malloc((1 + base->num_dangling) * sizeof(state **)); dangling[0] = &next->output1; next->output1 = NULL; next->output = base->start; for (int i = 0; i < base->num_dangling; i++) dangling[1 + i] = base->dangling[i]; fragment *connected = create_fragment(next, 1 + base->num_dangling, dangling); return connected; } return base; } int main(int argc, char **argv) { if (argc < 3) return -1; regex = argv[1]; char *match = argv[2]; fragment *parsed = parse_regex(); /* add .* to make all expressions match inside strings, since we have * basically got an implicit ^ without it. */ state *star_split = create_split_state(); state *star = create_single_state(any_char); star_split->output = star, star_split->output1 = parsed->start; star->output = star_split; bool matches = nfa_matches(match, star_split); printf("Result (%s on %s): %d\n", regex, match, matches); return 0; } ``` 請補完程式碼,使得上述程式得以接受基本的 regex 和字串比對。 ==作答區== LLLLL = ? * `(a)` `<` * `(b)` `>` * `(c)` `==` * `(d)` `>=` * `(d)` `<=` AAAAA = ? * `(a)` '(' * `(b)` ')' * `(c)` '|' * `(d)` '?' * `(e)` '<' * `(f)` '>' BBBBB = ? * `(a)` '(' * `(b)` ')' * `(c)` '|' * `(d)` '?' * `(e)` '<' * `(f)` '>' CCCCC = ? * `(a)` '(' * `(b)` ')' * `(c)` '|' * `(d)` '?' * `(e)` '<' * `(f)` '>'