# [2021q1](http://wiki.csie.ncku.edu.tw/linux/schedule) 第 14 週測驗題: 測驗 `2`
###### tags: `linux2021`
> [測驗題目總覽](https://hackmd.io/@sysprog/linux2021-quiz14)
:::info
本題目檢驗學員對 ==[高效 Web 伺服器開發](https://hackmd.io/@sysprog/fast-web-server)== 及 ==[以 sendfile 和 splice 系統呼叫達到 Zero-Copy](https://hackmd.io/@sysprog/linux2020-zerocopy)== 的認知
:::
以下程式碼嘗試透過「[高效 Web 伺服器開發](https://hackmd.io/@sysprog/fast-web-server)」提到的 epoll 和「[以 sendfile 和 splice 系統呼叫達到 Zero-Copy](https://hackmd.io/@sysprog/linux2020-zerocopy)」提到的 splice 系統呼叫,實作出具體而微的 [Port forwarding](https://en.wikipedia.org/wiki/Port_forwarding)。考慮一個情境:我們對外有一台防火牆,在 DNS 設定方面,我們設定 `ftp.mydomain.com` 及 `www.mydomain.com` 都指向這台防火牆。但我們希望所有 HTTP 連線都重新導向到內部的 `192.168.0.2` 這台機器上,而所有 FTP 連線都交由 `192.168.0.3` 來處理。這時候我們就可以使用 port forwarding 的方式來達成。對應的 NAT (Network Address Translation) 的設定如下:
```=
redirect_port tcp 192.168.0.2:80 80
redirect_port tcp 192.168.0.3:20 20
redirect_port tcp 192.168.0.3:21 21
```
第 1 行的目的就是將 port 80 的 tcp 連線重新導向到 `192.168.0.2` 的 port 80,而第 2 和第 3 行是將 port 20 及 port 21 的連線交由 `192.168.0.3` 來處理。在 `192.168.0.2` 及 `192.168.0.3` 這二台機器上,我們只要設定它們的 gateway 為防火牆的 IP,例如 `192.168.0.1` 即可。
使用 splice 系統呼叫,我們有機會在網路介面控制器的支援下,達到 Zero-copy 資料傳輸。
原始程式碼可見 `proxy.c`,其 `list.h` 取自 [list.h](https://github.com/sysprog21/linux-list/blob/master/include/list.h),改寫自 Linux 核心原始程式碼。
假設本地機器系統 port 8081 已有網頁伺服器在等待連線。`proxy` 的測試方式為
```shell
$ ./proxy 8082 localhost 8081
```
等程式執行後,在另一個終端機畫面中輸入下列命令:
```shell
$ telnet localhost 8082
```
接著你就可以輸入 HTTP 請求字串,如 `GET /index.html`。
此外,你還可以把 port 8082 轉向到 Google 首頁:
先找出 `www.google.com` 的 IP 地址:
```shell
$ nslookup www.google.com
```
得到以下輸出:
```
Name: www.google.com
Address: 172.217.27.132
```
修改上述命令:
```shell
$ ./proxy 8082 172.217.27.132 80
```
重複上述 `telnet` 命令並輸入 HTTP 請求字串 (如 `GET /`),這時候就會看到 Google 首頁的內容。
以下是 `proxy.c` 程式碼列表
```cpp
/* Simple port forwarder
* Uses pipes to splice two sockets together. This should give something
* approaching zero copy, if the NIC driver is capable.
* This method is rather file descriptor intensive (4 fds/connection), so
* make sure you have enough.
*/
#define _GNU_SOURCE 1
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <netdb.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <time.h>
#include <unistd.h>
#include "list.h"
/* Report the failed operation x through perror() and exit with status 1. */
#define err(x) perror(x), exit(1)
/* Allocate a zero-filled object sized for what x points to and assign it to x
 * (xmalloc() never returns on allocation failure).
 */
#define NEW(x) ((x) = xmalloc(sizeof(*(x))))
/* Larger of a and b. The selected argument is evaluated twice, so do not pass
 * expressions with side effects.
 */
#define MAX(a, b) ((a) > (b) ? (a) : (b))
/* Idle period, in time() units (seconds), added to the current time whenever
 * a connection's expiry is (re)armed.
 */
static int connection_timeout = 5; /* FIXME: configurable */
/* Allocation-failure handler: print a fixed message on stderr and terminate
 * the process with exit status 1. Does not return.
 */
static void oom(void)
{
    fputs("Out of memory\n", stderr);
    exit(1);
}
/* Allocate size zero-filled bytes.
 * A NULL result from calloc() is handed to oom(), which terminates the
 * process, so callers never have to check the returned pointer.
 */
void *xmalloc(size_t size)
{
    void *mem = calloc(size, 1);

    if (mem == NULL) {
        oom();
    }
    return mem;
}
/* Resize the allocation old to size bytes with realloc().
 * A NULL result is handed to oom(), which terminates the process, so callers
 * never have to check the returned pointer.
 */
void *xrealloc(void *old, size_t size)
{
    void *resized = realloc(old, size);

    if (resized == NULL) {
        oom();
    }
    return resized;
}
/* Translate a host name and a port/service string into socket addresses.
 *
 * name:  host name or numeric address handed to getaddrinfo()
 * port:  service name or port number handed to getaddrinfo()
 * flags: value for the ai_flags hint (e.g. AI_PASSIVE for a listening socket)
 *
 * Returns the address list produced by getaddrinfo(); the caller releases it
 * with freeaddrinfo(). On lookup failure a diagnostic is printed to stderr
 * and the process exits with status 1.
 */
struct addrinfo *resolve(char *name, char *port, int flags)
{
    struct addrinfo hint = {0};
    struct addrinfo *result = NULL;

    hint.ai_flags = flags;

    int rc = getaddrinfo(name, port, &hint, &result);
    if (rc == 0) {
        return result;
    }

    fprintf(stderr, "proxy: Cannot resolve %s %s: %s\n", name, port,
            gai_strerror(rc));
    exit(1);
}
/* Put fd into non-blocking mode.
 *
 * fd:    descriptor to modify
 * cache: optional memo of the file status flags. When it is non-NULL and
 *        holds a value other than -1, that value is reused instead of
 *        querying the descriptor; otherwise the flags are read with F_GETFL
 *        and, if cache is non-NULL, stored there for later calls.
 *
 * Failures of fcntl() are reported with perror(). A failed F_GETFL result is
 * never cached and never passed on to F_SETFL.
 */
void setnonblock(int fd, int *cache)
{
    int flags;

    if (cache && *cache != -1) {
        flags = *cache;
    } else {
        flags = fcntl(fd, F_GETFL, 0);
        if (flags < 0) {
            perror("fcntl(F_GETFL)");
            return;
        }
        if (cache) {
            *cache = flags;
        }
    }

    if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
        perror("fcntl(F_SETFL)");
    }
}
/* Pipe used as the staging area between the two sockets of a forwarded
 * connection: data is spliced from a socket into the pipe, then from the
 * pipe into the peer socket.
 */
struct buffer {
int pipe[2]; /* pipe fds: [0] is spliced out of, [1] is spliced into */
int bytes; /* byte counter: raised by move_data_in() for data spliced in;
            * move_data_out() keeps draining while it is positive */
};
/* One endpoint (socket) of a proxied connection. The accepted client socket
 * and the outgoing socket each get their own conn, linked through `other`.
 */
struct conn {
struct conn *other; /* peer endpoint; NULL until openconn() links the pair */
int fd; /* the socket of this endpoint */
struct buffer buf; /* data read from fd, waiting to be written to other->fd */
time_t expire; /* time() value after which the connection is closed as idle */
struct list_head expire_node; /* link in expire_list */
};
/* Connections ordered by expiry: entries are appended at the tail whenever
 * their timeout is (re)armed, and expire_connections() scans from the head.
 */
LIST_HEAD(expire_list);
/* Smallest capacity the events array is grown to. */
#define MIN_EVENTS 32
/* Result array passed to epoll_wait(); (re)allocated by epoll_add(). */
struct epoll_event *events;
/* num_events: decremented by epoll_del() and passed to epoll_wait() as the
 *             maximum number of events to return -- presumably the number of
 *             descriptors currently registered.
 * max_events: allocated capacity of the events array, in elements.
 */
int num_events, max_events;
/* Register fd with the epoll instance efd.
 *
 * revents: epoll event mask to watch for (stored in ev.events)
 * conn:    user pointer delivered back with each event (NULL is used for
 *          the listening socket)
 *
 * Before registering, the global events array is grown (doubling, but never
 * below MIN_EVENTS) once the EEE expression reaches max_events. EEE is an
 * intentional blank of this exercise.
 * Returns the result of epoll_ctl(): 0 on success, -1 on error.
 */
int epoll_add(int efd, int fd, int revents, void *conn)
{
struct epoll_event ev = {.events = revents, .data.ptr = conn};
if (EEE >= max_events) {
max_events = MAX(max_events * 2, MIN_EVENTS);
events = xrealloc(events, sizeof(struct epoll_event) * max_events);
}
return epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev);
}
/* Remove fd from the epoll instance efd and account for it in num_events.
 * Returns the result of epoll_ctl(): 0 on success, -1 on error.
 */
int epoll_del(int efd, int fd)
{
    /* EPOLL_CTL_DEL ignores the event argument; a non-NULL dummy is passed,
     * presumably for kernels that still insisted on a non-NULL pointer.
     */
    void *dummy_event = (void *) 1L;

    num_events -= 1;
    assert(num_events >= 0);
    return epoll_ctl(efd, EPOLL_CTL_DEL, fd, dummy_event);
}
/* Set up the staging pipe of buf (both ends non-blocking).
 * Returns buf on success; on failure reports the error with perror() and
 * returns NULL.
 */
struct buffer *newbuffer(struct buffer *buf)
{
    int rc = pipe2(buf->pipe, O_NONBLOCK);

    if (rc >= 0) {
        return buf;
    }
    perror("pipe");
    return NULL;
}
/* Release the staging pipe of buf by closing both of its ends. */
void delbuffer(struct buffer *buf)
{
    for (int end = 0; end < 2; end++) {
        close(buf->pipe[end]);
    }
}
/* Tear down a single endpoint: unlink it from the expiry list, close its
 * staging pipe, deregister and close its socket, then free the structure.
 * The peer (conn->other) is left untouched; see closeconn() for the pair.
 */
void delconn(int efd, struct conn *conn)
{
    struct list_head *node = &conn->expire_node;
    int sock = conn->fd;

    list_del(node);
    delbuffer(&conn->buf);
    /* Deregister before close() so the epoll bookkeeping stays in step. */
    epoll_del(efd, sock);
    close(sock);
    free(conn);
}
/* Wrap the socket fd in a new connection endpoint.
 *
 * efd: epoll instance the socket is registered with (edge-triggered, for
 *      both readability and writability)
 * fd:  connected or connecting socket; ownership passes to this function
 * now: current time, used to arm the idle timeout
 *
 * Returns the new endpoint, already queued on expire_list and with
 * conn->other == NULL (the allocation is zero-filled). On failure fd is
 * closed, nothing is leaked, and NULL is returned.
 */
struct conn *newconn(int efd, int fd, time_t now)
{
    struct conn *conn;
    NEW(conn);
    conn->fd = fd;
    /* Make the node self-linked so that delconn()'s list_del() is harmless
     * on an endpoint that never reached expire_list.
     */
    INIT_LIST_HEAD(&conn->expire_node);

    if (!newbuffer(&conn->buf)) {
        /* Nothing was registered and no pipe exists: going through
         * delconn() here would close fd 0 (the zeroed pipe array) and
         * decrement the epoll counter without a matching add.
         */
        close(fd);
        free(conn);
        return NULL;
    }
    if (epoll_add(efd, fd, EPOLLIN | EPOLLOUT | EPOLLET, conn) < 0) {
        perror("epoll");
        /* delconn() closes the pipe and the socket and reverts the
         * bookkeeping done by epoll_add().
         */
        delconn(efd, conn);
        return NULL;
    }

    conn->expire = now + connection_timeout;
    list_add_tail(&conn->expire_node, &expire_list);
    return conn;
}
/* Accept one pending connection on the listening socket lfd, switch it to
 * non-blocking mode (cache memoizes the fcntl flags) and register it as a new
 * endpoint. accept() failures are reported with perror() and otherwise
 * ignored.
 */
void new_request(int efd, int lfd, int *cache, time_t now)
{
    int client = accept(lfd, NULL, NULL);

    if (client >= 0) {
        setnonblock(client, cache);
        newconn(efd, client, now);
        return;
    }
    perror("accept");
}
/* Start the outgoing half of a proxied connection.
 *
 * efd:   epoll instance the new socket is registered with
 * host:  destination; only the first addrinfo entry is used
 * cache: fcntl flag memo passed to setnonblock()
 * other: the already accepted endpoint to pair with
 * now:   current time for the idle timeout
 *
 * The connect() is non-blocking, so EINPROGRESS counts as success.
 * Returns the new endpoint, cross-linked with other, or NULL on failure.
 */
struct conn *openconn(int efd,
                      struct addrinfo *host,
                      int *cache,
                      struct conn *other,
                      time_t now)
{
    int sock = socket(host->ai_family, SOCK_STREAM, 0);
    if (sock < 0) {
        return NULL;
    }

    setnonblock(sock, cache);
    if (connect(sock, host->ai_addr, host->ai_addrlen) < 0 &&
        errno != EINPROGRESS) {
        perror("connect");
        close(sock);
        return NULL;
    }

    struct conn *upstream = newconn(efd, sock, now);
    if (upstream == NULL) {
        return NULL;
    }
    upstream->other = other;
    other->other = upstream;
    return upstream;
}
#define BUFSZ 16384 /* FIXME: configurable */
/* Socket -> pipe.
 * Splices up to BUFSZ bytes at a time from srcfd into the write end of buf's
 * pipe, adding every transferred chunk to buf->bytes, until splice() stops
 * making progress.
 *
 * Returns true when the loop ended because splice() reported
 * EAGAIN/EWOULDBLOCK; returns false when splice() returned 0 or failed with
 * any other error.
 */
bool move_data_in(int srcfd, struct buffer *buf)
{
    const unsigned int flags = SPLICE_F_NONBLOCK | SPLICE_F_MOVE;

    for (;;) {
        int moved = splice(srcfd, NULL, buf->pipe[1], NULL, BUFSZ, flags);

        if (moved > 0) {
            buf->bytes += moved;
            continue;
        }
        if (moved == 0) {
            return false;
        }
        return errno == EAGAIN || errno == EWOULDBLOCK;
    }
}
/* From pipe to socket.
 * While buf->bytes is positive, splices at most BUFSZ bytes per call from the
 * read end of buf's pipe into dstfd. The loop stops early when splice()
 * returns 0 or reports EAGAIN/EWOULDBLOCK.
 *
 * Returns false only when splice() fails with another error; true otherwise,
 * including when data is still pending in the pipe.
 * FFF is an intentional blank of this exercise.
 */
bool move_data_out(struct buffer *buf, int dstfd)
{
while (buf->bytes > 0) {
int bytes = buf->bytes;
/* Cap a single splice() at BUFSZ bytes. */
if (bytes > BUFSZ) bytes = BUFSZ;
int n = splice(buf->pipe[0], NULL, dstfd, NULL, bytes,
SPLICE_F_NONBLOCK | SPLICE_F_MOVE);
if (n == 0) break;
if (n < 0) {
/* Destination cannot take more right now: keep the rest for later. */
if (errno == EAGAIN || errno == EWOULDBLOCK) break;
return false;
}
FFF;
}
/* bytes > 0, add dst to epoll set. Otherwise, remove if it was added */
return true;
}
/* Tear down a whole proxied connection: the peer endpoint first, if one has
 * been linked, then conn itself.
 */
void closeconn(int efd, struct conn *conn)
{
    struct conn *peer = conn->other;

    if (peer != NULL) {
        delconn(efd, peer);
    }
    delconn(efd, conn);
}
/* Close every connection whose idle timeout has passed.
 *
 * efd: epoll instance the connections are registered with
 * now: current time
 *
 * expire_list is kept in expiry order, so scanning stops at the first entry
 * that is still alive. Returns the time in milliseconds until that entry
 * expires (suitable as an epoll_wait() timeout), or -1 when no connection is
 * left.
 */
int expire_connections(int efd, time_t now)
{
    bool closed;

    do {
        struct conn *conn, *tmp;

        closed = false;
        list_for_each_entry_safe (conn, tmp, &expire_list, expire_node) {
            if (conn->expire > now)
                return (conn->expire - now) * 1000;
            /* closeconn() frees conn AND its peer. The peer is frequently
             * the very next list entry, i.e. the saved cursor tmp, so the
             * "safe" iteration must not continue from tmp: restart from the
             * list head instead.
             */
            closeconn(efd, conn);
            closed = true;
            break;
        }
    } while (closed);

    return -1;
}
/* Record activity on conn: re-arm its idle timeout relative to now and move
 * it to the tail of expire_list, keeping the list in expiry order.
 */
void touch_conn(struct conn *conn, time_t now)
{
    struct list_head *node = &conn->expire_node;

    list_del(node);
    conn->expire = now + connection_timeout;
    list_add_tail(node, &expire_list);
}
/* Create the listening socket and register it with the epoll instance.
 *
 * efd:   epoll instance; the socket is added for EPOLLIN with a NULL user
 *        pointer, which is how the event loop recognises it
 * lname: local address to bind to
 * port:  local port or service name
 *
 * Any failure is fatal: the error is reported and the process exits.
 * Returns the non-blocking listening descriptor.
 */
int listen_socket(int efd, char *lname, char *port)
{
    struct addrinfo *bind_addr = resolve(lname, port, AI_PASSIVE);

    int sock = socket(bind_addr->ai_family, SOCK_STREAM, 0);
    if (sock < 0) {
        err("socket");
    }

    int reuse = 1;
    if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(int)) < 0) {
        err("SO_REUSEADDR");
    }
    if (bind(sock, bind_addr->ai_addr, bind_addr->ai_addrlen) < 0) {
        err("bind");
    }
    if (listen(sock, 20) < 0) {
        err("listen");
    }

    setnonblock(sock, NULL);
    freeaddrinfo(bind_addr);

    if (epoll_add(efd, sock, EPOLLIN, NULL) < 0) {
        err("epoll add listen fd");
    }
    return sock;
}
/* Entry point: proxy inport outhost outport [listenaddr]
 *
 * Listens on listenaddr:inport (default 0.0.0.0) and forwards every accepted
 * connection to outhost:outport. A single epoll loop moves data in both
 * directions with splice() and closes connections that stay idle longer than
 * connection_timeout. Never returns under normal operation.
 */
int main(int ac, char **av)
{
    if (ac != 4 && ac != 5) {
        fprintf(stderr, "Usage: proxy inport outhost outport [listenaddr]\n");
        exit(1);
    }

    struct addrinfo *outhost = resolve(av[2], av[3], 0);
    int efd = epoll_create(10);
    if (efd < 0) err("epoll_create");

    /* av[ac] is NULL, so av[4] may be tested even when ac == 4. */
    int lfd = listen_socket(efd, av[4] ? av[4] : "0.0.0.0", av[1]);

    /* Memoized fcntl flags for accepted and outgoing sockets respectively. */
    int cache_in = -1, cache_out = -1;
    int timeo = -1; /* epoll_wait timeout in ms; -1 blocks indefinitely */

    for (;;) {
        int nfds = epoll_wait(efd, events, num_events, timeo);
        if (nfds < 0) {
            perror("epoll");
            continue;
        }

        time_t now = time(NULL);
        /* NOTE(review): closing a connection frees both endpoints, but a
         * later entry of this same batch may still carry a pointer to one of
         * them. That hazard is not addressed here.
         */
        for (int i = 0; i < nfds; i++) {
            struct epoll_event *ev = &events[i];
            struct conn *conn = ev->data.ptr;

            /* listen socket */
            if (!conn) {
                if (ev->events & EPOLLIN) new_request(efd, lfd, &cache_in, now);
                continue;
            }

            if (ev->events & (EPOLLERR | EPOLLHUP)) {
                closeconn(efd, conn);
                continue;
            }

            struct conn *other = conn->other;

            /* No attempt for partial close right now */
            if (ev->events & EPOLLIN) {
                touch_conn(conn, now);
                /* First data from a client: open the outgoing side lazily. */
                if (!other)
                    other = openconn(efd, outhost, &cache_out, conn, now);
                if (!other) {
                    /* Destination unreachable: drop the client instead of
                     * dereferencing a NULL peer below.
                     */
                    closeconn(efd, conn);
                    continue;
                }
                bool in = move_data_in(conn->fd, &conn->buf);
                bool out = move_data_out(&conn->buf, other->fd);
                if (!in || !out) {
                    closeconn(efd, conn);
                    continue;
                }
                touch_conn(other, now);
            }

            if ((ev->events & EPOLLOUT) && other) {
                if (!move_data_out(&other->buf, conn->fd)) {
                    /* Close the pair: deleting only conn would leave
                     * other->other dangling and conn in use below.
                     */
                    closeconn(efd, conn);
                    continue;
                }
                touch_conn(conn, now);

                /* When the pipe filled up could have lost input events.
                 * Unfortunately, splice does not tell us which end was
                 * responsible for 0, so we have to ask explicitly.
                 */
                int len = 0;
                if (ioctl(other->fd, FIONREAD, &len) < 0) perror("ioctl");
                if (len > 0) {
                    if (!move_data_in(other->fd, &other->buf))
                        closeconn(efd, other);
                }
            }
        }
        timeo = expire_connections(efd, now);
    }
    return 0;
}
```
請補完程式碼。
==作答區==
EEE = ?
* `(a)` `num_events`
* `(b)` `num_events++`
* `(c)` `num_events--`
* `(d)` `++num_events`
* `(e)` `--num_events`
FFF = ?
* `(a)` 不需要加入程式碼
* `(b)` `buf->bytes += n`
* `(c)` `buf->bytes -= n`
* `(d)` `buf->bytes--`
* `(e)` `buf->bytes++`
:::success
延伸問題:
1. 解釋上述程式碼運作原理,指出實作缺陷並改進
2. 將 DNS 解析的機制納入,允許用 [FQDN](https://en.wikipedia.org/wiki/Fully_qualified_domain_name) 作為輸入
3. 嘗試克服連線數量的限制,設計實驗並探討如此 port forwarding 的效率
:::