# [2021q1](http://wiki.csie.ncku.edu.tw/linux/schedule) 第 14 週測驗題: 測驗 `2` ###### tags: `linux2021` > [測驗題目總覽](https://hackmd.io/@sysprog/linux2021-quiz14) :::info 本題目檢驗學員對 ==[高效 Web 伺服器開發](https://hackmd.io/@sysprog/fast-web-server)== 及 ==[以 sendfile 和 splice 系統呼叫達到 Zero-Copy](https://hackmd.io/@sysprog/linux2020-zerocopy)== 的認知 ::: 以下程式碼嘗試透過「[高效 Web 伺服器開發](https://hackmd.io/@sysprog/fast-web-server)」提到的 epoll 和「[以 sendfile 和 splice 系統呼叫達到 Zero-Copy](https://hackmd.io/@sysprog/linux2020-zerocopy)」提到的 splice 系統呼叫,實作出具體而微的 [Port forwarding](https://en.wikipedia.org/wiki/Port_forwarding)。考慮一個情境:我們對外有一台防火牆,在 DNS 設定方面,我們設定 `ftp.mydomain.com` 及 `www.mydomain.com` 都指向這台防火牆。但我們希望所有 HTTP 連線都重新導向到內部的 `192.168.0.2` 這台機器上,而所有 FTP 連線都交由 `192.168.0.3` 來處理。這時候我們就可以使用 port forwarding 的方式來達成。對應的 NAT (Network Address Translation) 的設定如下: ```= redirect_port tcp 192.168.0.2:80 80 redirect_port tcp 192.168.0.3:20 20 redirect_port tcp 192.168.0.3:21 21 ``` 第 1 行的目的就是將 port 80 的 tcp 連線重新導向到 `192.168.0.2` 的 port 80,而第 2 和第 3 行是將 port 20 及 port 21 的連線交由 `192.168.0.3` 來處理。在 `192.168.0.2` 及 `192.168.0.3` 這二台機器上,我們只要設定它們的 gateway 為防火牆的 IP,例如 `192.168.0.1` 即可。 使用 splice 系統呼叫,我們有機會在網路介面控制器的支援下,達到 Zero-copy 資料傳輸。 原始程式碼可見 `proxy.c`,其 `list.h` 取自 [list.h](https://github.com/sysprog21/linux-list/blob/master/include/list.h),改寫自 Linux 核心原始程式碼。 假設本地機器系統 port 8081 已有網頁伺服器在等待連線。`proxy` 的測試方式為 ```shell $ ./proxy 8082 localhost 8081 ``` 等程式執行後,在另一個終端機畫面中輸入下列命令: ```shell $ telnet localhost 8082 ``` 接著你就可以輸入 HTTP 請求字串,如 `GET /index.html`。 此外,你還可以把 port 8082 轉向到 Google 首頁: 先找出 `www.google.com` 的 IP 地址: ```shell $ nslookup www.google.com ``` 得到以下輸出: ``` Name: www.google.com Address: 172.217.27.132 ``` 修改上述命令: ```shell ./proxy 8082 172.217.27.132 80 ``` 重複上述 `telnet` 命令,這時候就會看到 Google 首頁的字串。 以下是 `proxy.c` 程式碼列表 ```cpp /* Simple port forwarder * Uses pipes to splice two sockets together. This should give something * approaching zero copy, if the NIC driver is capable. 
* This method is rather file descriptor intensive (4 fds/connection), so * make sure you have enough. */ #define _GNU_SOURCE 1 #include <assert.h> #include <errno.h> #include <fcntl.h> #include <limits.h> #include <netdb.h> #include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <sys/epoll.h> #include <sys/ioctl.h> #include <sys/socket.h> #include <time.h> #include <unistd.h> #include "list.h" #define err(x) perror(x), exit(1) #define NEW(x) ((x) = xmalloc(sizeof(*(x)))) #define MAX(a, b) ((a) > (b) ? (a) : (b)) static int connection_timeout = 5; /* FIXME: configurable */ static void oom(void) { fprintf(stderr, "Out of memory\n"); exit(1); } void *xmalloc(size_t size) { void *p = calloc(size, 1); if (!p) oom(); return p; } void *xrealloc(void *old, size_t size) { void *p = realloc(old, size); if (!p) oom(); return p; } struct addrinfo *resolve(char *name, char *port, int flags) { struct addrinfo *adr; struct addrinfo hint = {.ai_flags = flags}; int ret = getaddrinfo(name, port, &hint, &adr); if (ret) { fprintf(stderr, "proxy: Cannot resolve %s %s: %s\n", name, port, gai_strerror(ret)); exit(1); } return adr; } void setnonblock(int fd, int *cache) { int flags; if (!cache || *cache == -1) { flags = fcntl(fd, F_GETFL, 0); if (cache) *cache = flags; } else flags = *cache; fcntl(fd, F_SETFL, flags | O_NONBLOCK); } struct buffer { int pipe[2]; int bytes; }; struct conn { struct conn *other; int fd; struct buffer buf; time_t expire; struct list_head expire_node; }; LIST_HEAD(expire_list); #define MIN_EVENTS 32 struct epoll_event *events; int num_events, max_events; int epoll_add(int efd, int fd, int revents, void *conn) { struct epoll_event ev = {.events = revents, .data.ptr = conn}; if (EEE >= max_events) { max_events = MAX(max_events * 2, MIN_EVENTS); events = xrealloc(events, sizeof(struct epoll_event) * max_events); } return epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev); } int epoll_del(int efd, int fd) { num_events--; assert(num_events >= 0); return 
epoll_ctl(efd, EPOLL_CTL_DEL, fd, (void *) 1L); } /* Create buffer between two connections */ struct buffer *newbuffer(struct buffer *buf) { if (pipe2(buf->pipe, O_NONBLOCK) < 0) { perror("pipe"); return NULL; } return buf; } void delbuffer(struct buffer *buf) { close(buf->pipe[0]); close(buf->pipe[1]); } void delconn(int efd, struct conn *conn) { list_del(&conn->expire_node); delbuffer(&conn->buf); epoll_del(efd, conn->fd); close(conn->fd); free(conn); } struct conn *newconn(int efd, int fd, time_t now) { struct conn *conn; NEW(conn); conn->fd = fd; if (!newbuffer(&conn->buf)) { delconn(efd, conn); return NULL; } if (epoll_add(efd, fd, EPOLLIN | EPOLLOUT | EPOLLET, conn) < 0) { perror("epoll"); delconn(efd, conn); return NULL; } conn->expire = now + connection_timeout; list_add_tail(&conn->expire_node, &expire_list); return conn; } /* Process incoming connection. */ void new_request(int efd, int lfd, int *cache, time_t now) { int newsk = accept(lfd, NULL, NULL); if (newsk < 0) { perror("accept"); return; } setnonblock(newsk, cache); newconn(efd, newsk, now); } /* Open outgoing connection */ struct conn *openconn(int efd, struct addrinfo *host, int *cache, struct conn *other, time_t now) { int outfd = socket(host->ai_family, SOCK_STREAM, 0); if (outfd < 0) return NULL; setnonblock(outfd, cache); int n = connect(outfd, host->ai_addr, host->ai_addrlen); if (n < 0 && errno != EINPROGRESS) { perror("connect"); close(outfd); return NULL; } struct conn *conn = newconn(efd, outfd, now); if (conn) { conn->other = other; other->other = conn; } return conn; } #define BUFSZ 16384 /* FIXME: configurable */ /* Move from socket to pipe */ bool move_data_in(int srcfd, struct buffer *buf) { for (;;) { int n = splice(srcfd, NULL, buf->pipe[1], NULL, BUFSZ, SPLICE_F_NONBLOCK | SPLICE_F_MOVE); if (n > 0) buf->bytes += n; if (n == 0) return false; if (n < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK) return true; return false; } } return true; } /* From pipe to socket */ bool 
move_data_out(struct buffer *buf, int dstfd) { while (buf->bytes > 0) { int bytes = buf->bytes; if (bytes > BUFSZ) bytes = BUFSZ; int n = splice(buf->pipe[0], NULL, dstfd, NULL, bytes, SPLICE_F_NONBLOCK | SPLICE_F_MOVE); if (n == 0) break; if (n < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK) break; return false; } FFF; } /* bytes > 0, add dst to epoll set. Otherwise, remove if it was added */ return true; } void closeconn(int efd, struct conn *conn) { if (conn->other) delconn(efd, conn->other); delconn(efd, conn); } int expire_connections(int efd, time_t now) { struct conn *conn, *tmp; list_for_each_entry_safe (conn, tmp, &expire_list, expire_node) { if (conn->expire > now) return (conn->expire - now) * 1000; closeconn(efd, conn); } return -1; } void touch_conn(struct conn *conn, time_t now) { conn->expire = now + connection_timeout; list_del(&conn->expire_node); list_add_tail(&conn->expire_node, &expire_list); } int listen_socket(int efd, char *lname, char *port) { struct addrinfo *laddr = resolve(lname, port, AI_PASSIVE); int lfd = socket(laddr->ai_family, SOCK_STREAM, 0); if (lfd < 0) err("socket"); int opt = 1; if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(int)) < 0) err("SO_REUSEADDR"); if (bind(lfd, laddr->ai_addr, laddr->ai_addrlen) < 0) err("bind"); if (listen(lfd, 20) < 0) err("listen"); setnonblock(lfd, NULL); freeaddrinfo(laddr); if (epoll_add(efd, lfd, EPOLLIN, NULL) < 0) err("epoll add listen fd"); return lfd; } int main(int ac, char **av) { if (ac != 4 && ac != 5) { fprintf(stderr, "Usage: proxy inport outhost outport [listenaddr]\n"); exit(1); } struct addrinfo *outhost = resolve(av[2], av[3], 0); int efd = epoll_create(10); if (efd < 0) err("epoll_create"); int lfd = listen_socket(efd, av[4] ? 
av[4] : "0.0.0.0", av[1]); int cache_in = -1, cache_out = -1; int timeo = -1; for (;;) { int nfds = epoll_wait(efd, events, num_events, timeo); if (nfds < 0) { perror("epoll"); continue; } time_t now = time(NULL); for (int i = 0; i < nfds; i++) { struct epoll_event *ev = &events[i]; struct conn *conn = ev->data.ptr; /* listen socket */ if (!conn) { if (ev->events & EPOLLIN) new_request(efd, lfd, &cache_in, now); continue; } if (ev->events & (EPOLLERR | EPOLLHUP)) { closeconn(efd, conn); continue; } struct conn *other = conn->other; /* No attempt for partial close right now */ if (ev->events & EPOLLIN) { touch_conn(conn, now); if (!other) other = openconn(efd, outhost, &cache_out, conn, now); bool in = move_data_in(conn->fd, &conn->buf); bool out = move_data_out(&conn->buf, other->fd); if (!in || !out) { closeconn(efd, conn); continue; } touch_conn(other, now); } if ((ev->events & EPOLLOUT) && other) { if (!move_data_out(&other->buf, conn->fd)) delconn(efd, conn); else touch_conn(conn, now); /* When the pipe filled up we could have lost input events. * Unfortunately, splice does not tell us which end was * responsible for 0, so we have to ask explicitly. */ int len = 0; if (ioctl(other->fd, FIONREAD, &len) < 0) perror("ioctl"); if (len > 0) { if (!move_data_in(other->fd, &other->buf)) closeconn(efd, other); } } } timeo = expire_connections(efd, now); } return 0; } ``` 請補完程式碼。 ==作答區== EEE = ? * `(a)` `num_events` * `(b)` `num_events++` * `(c)` `num_events--` * `(d)` `++num_events` * `(e)` `--num_events` FFF = ? * `(a)` 不需要加入程式碼 * `(b)` `buf->bytes += n` * `(c)` `buf->bytes -= n` * `(d)` `buf->bytes--` * `(e)` `buf->bytes++` :::success 延伸問題: 1. 解釋上述程式碼運作原理,指出實作缺陷並改進 2. 將 DNS 解析的機制納入,允許用 [FQDN](https://en.wikipedia.org/wiki/Fully_qualified_domain_name) 作為輸入 3. 嘗試克服連線數量的限制,設計實驗並探討如此 port forwarding 的效率 :::