Try   HackMD

2022q1 Homework6 (ktcp)

contributed by < hankluo6 >

kecho

傳遞參數到核心模組

insmod 最後會呼叫到 load_module,在 load_module 中會先透過 find_module_sections 設置 module 的指標到對應的 section,接著 parse_args 便能將對應的參數寫入。

/*
 * Excerpt: record where the module's "__param" section lives so that
 * parse_args() can later match user-supplied parameters against it.
 */
static int find_module_sections(struct module *mod, struct load_info *info)
{
    /* Point mod->kp at the "__param" section; mod->num_kp is filled in too. */
    mod->kp = section_objs(info, "__param",
			       sizeof(*mod->kp), &mod->num_kp);
    ...
}

find_module_sections 中,struct module 是代表核心模組主要的結構,將 mod->kp 指向 __param 這個 section,並設置 mod->num_kp 的值。

/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
/*
 * Excerpt: walk the argument string one item at a time and hand each
 * parameter/value pair to parse_one(). Parsing stops early at a bare "--".
 */
char *parse_args(const char *doing,
		 char *args,
		 const struct kernel_param *params,
		 unsigned num,
		 s16 min_level,
		 s16 max_level,
		 void *arg,
		 int (*unknown)(char *param, char *val,
				const char *doing, void *arg))
{
    ... 
    while (*args) {
        int ret;
        int irq_was_disabled;

        /* Split off the next parameter name and its value (val may be NULL). */
        args = next_arg(args, &param, &val);
        /* Stop at -- */
        if (!val && strcmp(param, "--") == 0)
            return err ?: args;
        /* Match the name against params[] and apply the value. */
        ret = parse_one(param, val, doing, params, num,
                min_level, max_level, arg, unknown);
        ...

    }

    return err;
}

/*
 * Excerpt: look up @param among the @num_params entries of @params and, on a
 * name match, store @val through that entry's ops->set callback.
 *
 * For a matched entry the function returns the result of ops->set, or -EPERM
 * when param_check_unsafe() rejects the parameter.
 */
static int parse_one(char *param,
		     char *val,
		     const char *doing,
		     const struct kernel_param *params,
		     unsigned num_params,
		     s16 min_level,
		     s16 max_level,
		     void *arg,
		     int (*handle_unknown)(char *param, char *val,
				     const char *doing, void *arg))
{
    unsigned int i;
    int err;

    /* Find parameter */
    for (i = 0; i < num_params; i++) {
        if (parameq(param, params[i].name)) {
            /* The type-specific setter writes the value into the variable. */
            if (param_check_unsafe(&params[i]))
                err = params[i].ops->set(val, &params[i]);
            else
                err = -EPERM;
            return err;
        }
    }
    ...
}

parse_args 將使用者輸入的每個參數都傳到 parse_one 檢查,parse_one 將使用者提供的參數名字與 __param section 內的所有參數名字比對,如果相同則透過 params[i].ops->set 設置參數。

所以 module_param 應該要將參數植入到 __param section,並提供對應的 ops

/* Declare a module parameter whose name equals the variable's name. */
#define module_param(name, type, perm)				\
	module_param_named(name, name, type, perm)

/*
 * Declare parameter @name backed by variable @value. The ops table is
 * selected by token-pasting the type: &param_ops_##type.
 */
#define module_param_named(name, value, type, perm)			   \
	param_check_##type(name, &(value));				   \
	module_param_cb(name, &param_ops_##type, &value, perm);		   \
	__MODULE_PARM_TYPE(name, #type)

/* Register a parameter with an explicit ops table; level is -1, flags 0. */
#define module_param_cb(name, ops, arg, perm)				      \
	__module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, -1, 0)

/*
 * Emit the parameter's name string and a struct kernel_param that is placed
 * in the "__param" section.
 */
#define __module_param_call(prefix, name, ops, arg, perm, level, flags)	\
	/* Default value instead of permissions? */			\
	static const char __param_str_##name[] = prefix #name;		\
	static struct kernel_param __moduleparam_const __param_##name	\
	__used __section("__param")					\
	__aligned(__alignof__(struct kernel_param))			\
	= { __param_str_##name, THIS_MODULE, ops,			\
	    VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } }

可以看到最後會透過 __module_param_call 展開,其中 __used __section("__param") 證明了會將這些參數放置在 __param section。而實際設置參數 ops->set 函式的 ops 會在 module_param_named 時設置成 &param_ops_##type 傳入,param_ops_##type 會被展開成 param_ops_ushort、param_ops_bool 等預先定義好的 type,其中也有定義好每個 type 對應的 set 操作,透過對應的 set 函式便能將資料寫入。

user-echo-server 運作原理

/*
 * Excerpt (setup half): create a non-blocking TCP listening socket bound to
 * INADDR_ANY:SERVER_PORT. Every failure is reported through server_err().
 */
int main(void)
{
    static struct epoll_event events[EPOLL_SIZE];
    struct sockaddr_in addr = {
        .sin_family = PF_INET,
        .sin_port = htons(SERVER_PORT),
        .sin_addr.s_addr = htonl(INADDR_ANY),
    };
    socklen_t socklen = sizeof(addr);

    /* Connected clients are tracked in this list; it starts out empty. */
    client_list_t *list = NULL;
    int listener;
    if ((listener = socket(PF_INET, SOCK_STREAM, 0)) < 0)
        server_err("Fail to create socket", &list);
    printf("Main listener (fd=%d) was created.\n", listener);

    if (setnonblock(listener) == -1)
        server_err("Fail to set nonblocking", &list);
    if (bind(listener, (struct sockaddr *) &addr, sizeof(addr)) < 0)
        server_err("Fail to bind", &list);
    printf("Listener was binded to %s\n", inet_ntoa(addr.sin_addr));
    
    /* 128 is the backlog argument passed to listen(2). */
    if (listen(listener, 128) < 0)
        server_err("Fail to listen", &list);

    ...
}

透過 socket(2) 建立 socket,setnonblock 將這個 socket 設置為 non-blocking,bind(2) 將 socket 與 address 綁定在一起。

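listen(2) 讓 socket 開始監聽,使其能夠接收 client 端的連線請求。TCP 在 kernel 中會維護兩個 queue,第一個 queue 用來儲存正在進行 three-way handshaking 的 request,第二個 queue 會存放已完成連線、正在等待 accept 的 request。依照 listen(2) 的說明,Linux 2.2 之後 backlog 參數指定的是第二個 queue (已完成連線、等待 accept) 的長度上限,而不是這兩個 queue 的總和;第一個 queue 的長度上限則由 /proc/sys/net/ipv4/tcp_max_syn_backlog 設定。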

    ...
        
    /* One epoll instance watches the listener and every accepted client. */
    int epoll_fd;
    if ((epoll_fd = epoll_create(EPOLL_SIZE)) < 0)
        server_err("Fail to create epoll", &list);

    /* `ev` is reused for every registration; only data.fd changes. */
    static struct epoll_event ev = {.events = EPOLLIN | EPOLLET};
    ev.data.fd = listener;
    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listener, &ev) < 0)
        server_err("Fail to control epoll", &list);

    while (1) {
        struct sockaddr_in client_addr;
        int epoll_events_count;
        if ((epoll_events_count = epoll_wait(epoll_fd, events, EPOLL_SIZE,
                                             EPOLL_RUN_TIMEOUT)) < 0)
            server_err("Fail to wait epoll", &list);
        for (int i = 0; i < epoll_events_count; i++) {
            /* EPOLLIN event for listener (new client connection) */
            if (events[i].data.fd == listener) {
                int client;
                /*
                 * Keep accepting until accept() no longer returns a positive
                 * descriptor; each new client is made non-blocking,
                 * registered with epoll and appended to the client list.
                 */
                while (
                    (client = accept(listener, (struct sockaddr *) &client_addr,
                                     &socklen)) > 0) {
                    setnonblock(client);
                    ev.data.fd = client;
                    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, client, &ev) < 0)
                        server_err("Fail to control epoll", &list);
                    push_back_client(&list, client,
                                     inet_ntoa(client_addr.sin_addr));
                }
            } else {
                /* EPOLLIN event for others (new incoming message from client)
                 */
                if (handle_message_from_client(events[i].data.fd, &list) < 0)
                    server_err("Handle message from client", &list);
            }
        }
    }
    ...

epoll(7) 可以同時監控多個 file descriptors 並判斷哪些 fd 有資料可寫入或讀取:

  • epoll_create(2) creates a new epoll instance and returns a file
    descriptor referring to that instance. (The more recent
    epoll_create1(2) extends the functionality of epoll_create(2).)

  • Interest in particular file descriptors is then registered via
    epoll_ctl(2), which adds items to the interest list of the
    epoll instance.

  • epoll_wait(2) waits for I/O events, blocking the calling thread
    if no events are currently available. (This system call can be
    thought of as fetching items from the ready list of the epoll
    instance.)

將要監聽的 listener 透過 epoll_ctl 註冊到 epoll 當中,epoll_wait 當有 client 連接時,便會回傳。在 for 迴圈中,透過 accept(2) 將這個 client 連接,並將 client 的 fd 透過 epoll_ctl 放到 epoll 中等待接收資料。而 push_back_client 透過 linked list 紀錄現在有哪些 client 連接。

epoll_wait 回傳的 events 不是 listener 時,表示 client 端有資料傳入,進入 handle_message_from_client 處理。

/*
 * Echo one message from a connected client back to it.
 *
 * Reads up to BUF_SIZE bytes from @client. A zero-length read means the peer
 * closed the connection, so the descriptor is closed and the client is
 * removed from @list. Otherwise the received bytes are logged and sent back.
 *
 * Returns the value returned by recv().
 */
static int handle_message_from_client(int client, client_list_t **list)
{
    char buf[BUF_SIZE];
    int len = recv(client, buf, BUF_SIZE, 0);

    if (len < 0) {
        server_err("Fail to receive", list);
        /* Never echo with a negative length should server_err() return. */
        return len;
    }

    if (len == 0) {
        if (close(client) < 0)
            server_err("Fail to close", list);
        *list = delete_client(list, client);
        printf("After fd=%d is closed, current numbers clients = %d\n", client,
               size_list(*list));
        return len;
    }

    /*
     * Bound the log output by len: buf is not NUL-terminated when exactly
     * BUF_SIZE bytes arrive.
     */
    printf("Client #%d :> %.*s", client, len, buf);

    /* Echo only the bytes received, not the whole BUF_SIZE buffer. */
    if (send(client, buf, (size_t) len, 0) < 0)
        server_err("Fail to send", list);

    return len;
}

handle_message_from_client 透過 recv 與 send 將 client 傳來的資料回傳,如果 len 為 0,表示對方已經關閉連線、不會再有資料傳入,可以認為 client 端已被關閉,利用 delete_client 將 client 從 linked list 中移除。

bench.c

/*
 * Worker thread: wait for the start signal, open one TCP connection to
 * TARGET_HOST:TARGET_PORT, time a single send/recv round trip of msg_dum,
 * check the echoed bytes, and add the elapsed microseconds to the next free
 * slot of time_res.
 *
 * NOTE(review): the parameter list holds only an attribute and no declarator;
 * it is kept exactly as quoted — confirm against the upstream source.
 */
static void *bench_worker(__attribute__((unused)))
{
    int sock_fd;
    char dummy[MAX_MSG_LEN];
    struct timeval start, end;

    /* Zero the buffer so a short or failed recv never exposes garbage. */
    memset(dummy, 0, sizeof(dummy));

    /* wait until all workers created */
    pthread_mutex_lock(&worker_lock);
    while (!ready)
        if (pthread_cond_wait(&worker_wait, &worker_lock)) {
            puts("pthread_cond_wait failed");
            exit(-1);
        }
    pthread_mutex_unlock(&worker_lock);

    sock_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (sock_fd == -1) {
        perror("socket");
        exit(-1);
    }

    struct sockaddr_in info = {
        .sin_family = PF_INET,
        .sin_addr.s_addr = inet_addr(TARGET_HOST),
        .sin_port = htons(TARGET_PORT),
    };

    if (connect(sock_fd, (struct sockaddr *) &info, sizeof(info)) == -1) {
        perror("connect");
        exit(-1);
    }

    /*
     * Only the two calls sit between the timestamps; their results are
     * checked afterwards so the measured region stays unchanged.
     */
    gettimeofday(&start, NULL);
    ssize_t sent = send(sock_fd, msg_dum, strlen(msg_dum), 0);
    ssize_t received = recv(sock_fd, dummy, MAX_MSG_LEN, 0);
    gettimeofday(&end, NULL);

    shutdown(sock_fd, SHUT_RDWR);
    close(sock_fd);

    if (sent < 0 || received < 0) {
        puts("echo round trip failed");
        exit(-1);
    }

    if (strncmp(msg_dum, dummy, strlen(msg_dum))) {
        puts("echo message validation failed");
        exit(-1);
    }

    /* idx and time_res are shared by all workers; res_lock guards both. */
    pthread_mutex_lock(&res_lock);
    time_res[idx++] += time_diff_us(&start, &end);
    pthread_mutex_unlock(&res_lock);

    pthread_exit(NULL);
}

static void bench(void)
{
    for (int i = 0; i < BENCH_COUNT; i++) {
        ready = false;

        create_worker(MAX_THREAD);

        pthread_mutex_lock(&worker_lock);

        ready = true;

        /* all workers are ready, let's start bombing kecho */
        pthread_cond_broadcast(&worker_wait);

        pthread_mutex_unlock(&worker_lock);

        /* waiting for all workers to finish the measurement */
        for (int x = 0; x < MAX_THREAD; x++)
            pthread_join(pt[x], NULL);

        idx = 0;
    }

    for (int i = 0; i < MAX_THREAD; i++)
        fprintf(bench_fd, "%d %ld\n", i, time_res[i] /= BENCH_COUNT);
}

create_worker 會透過 pthread_create 建立執行緒執行 bench_worker 函式,而為了確保每個執行緒能在相同的時間執行,pthread_cond_wait 讓每個執行緒等待 condition 發生,建立完執行緒後,pthread_cond_broadcast 喚醒所有等待的執行緒。bench_worker 便開始建立 TCP 連線,並以 gettimeofday 紀錄 send 與 recv 所花費的時間。


kecho 的速度比 user-echo-server 快將近 10 倍左右,這是因為 user space 的系統呼叫 (如 listen, accept) 最後也會呼叫到 kernel 內的 kernel_listen、kernel_accept,user-echo-server 等於在相同的核心路徑之外,還要額外付出每次系統呼叫在 user mode 與 kernel mode 之間切換的成本。

系統呼叫的成本雖然整體來說持續降低,但終究無法消弭,我們今年發表 Effective System Call Aggregation (ESCA) 來降低系統呼叫過程的 CPU mode switch 的衝擊。

Image Not Showing Possible Reasons
  • The image file may be corrupted
  • The server hosting the image is unavailable
  • The image path is incorrect
  • The image format is not supported
Learn More →
jserv


tags: linux2022