Try   HackMD

2021q1 第 14 週測驗題: 測驗 2

tags: linux2021

測驗題目總覽

以下程式碼嘗試透過「高效 Web 伺服器開發」提到的 epoll 和「以 sendfile 和 splice 系統呼叫達到 Zero-Copy」提到的 splice 系統呼叫,實作出具體而微的 Port forwarding。考慮一個情境:我們對外有一台防火牆,在 DNS 設定方面,我們設定 ftp.mydomain.com 和 www.mydomain.com 都指向這台防火牆。但我們希望所有 HTTP 連線都重新導向到內部的 192.168.0.2 這台機器上,而所有 FTP 連線都交由 192.168.0.3 來處理。這時候我們就可以使用 port forwarding 的方式來達成。對應的 NAT (Network Address Translation) 的設定如下:

redirect_port tcp 192.168.0.2:80 80
redirect_port tcp 192.168.0.3:20 20
redirect_port tcp 192.168.0.3:21 21

第 1 行的目的就是將 port 80 的 tcp 連線重新導向到 192.168.0.2 的 port 80,而第 2 和第 3 行是將 port 20 及 port 21 的連線交由 192.168.0.3 來處理。在 192.168.0.2 和 192.168.0.3 這二台機器上,我們只要設定它們的 gateway 為防火牆的 IP,例如 192.168.0.1 即可。

使用 splice 系統呼叫,我們有機會在網路介面控制器的支援下,達到 Zero-copy 資料傳輸。

原始程式碼可見 proxy.c,其 list.h 取自 list.h,改寫自 Linux 核心原始程式碼。

假設本地機器系統 port 8081 已有網頁伺服器在等待連線。proxy 的測試方式為

$ ./proxy 8082 localhost 8081

等程式執行後,在另一個終端機畫面中輸入下列命令:

$ telnet localhost 8082

接著你就可以輸入 HTTP 請求字串,如 GET /index.html

此外,你還可以把 port 8082 轉向到 Google 首頁:
先找出 www.google.com 的 IP 地址:

$ nslookup www.google.com

得到以下輸出:

Name:	www.google.com
Address: 172.217.27.132

修改上述命令:

./proxy 8082 172.217.27.132 80

重複上述 telnet 命令,這時候就會看到 Google 首頁的字串。

以下是 proxy.c 程式碼列表

/* Simple port forwarder
 * Uses pipes to splice two sockets together. This should give something
 * approaching zero copy, if the NIC driver is capable.
 * This method is rather file descriptor intensive (4 fds/connection), so
 * make sure you have enough.
 */

#define _GNU_SOURCE 1
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <netdb.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <time.h>
#include <unistd.h>
#include "list.h"

/* Print the errno message prefixed by x, then terminate with status 1.
 * Expands to a single comma expression, so `if (c) err("x");` works
 * without braces.
 */
#define err(x) perror(x), exit(1)
/* Allocate a zero-filled object of the type x points to (via xmalloc, which
 * exits on allocation failure) and store its address in the lvalue x.
 */
#define NEW(x) ((x) = xmalloc(sizeof(*(x))))
/* Larger of a and b. The selected argument is evaluated twice, so do not
 * pass expressions with side effects.
 */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Idle time, in seconds (it is added to time(NULL) values), after which a
 * connection is closed by expire_connections().
 */
static int connection_timeout = 5; /* FIXME: configurable */

/* Report an allocation failure on stderr and terminate with exit status 1. */
static void oom(void)
{
    fputs("Out of memory\n", stderr);
    exit(1);
}

/* Allocate `size` zero-filled bytes.
 * Does not return on failure: a NULL result from calloc() is routed to
 * oom(), which exits the process.
 */
void *xmalloc(size_t size)
{
    void *mem = calloc(size, 1);

    if (mem == NULL) {
        oom();
    }
    return mem;
}

/* Resize the allocation `old` to `size` bytes and return the new address.
 * Does not return on failure: a NULL result from realloc() is routed to
 * oom(), which exits the process.
 */
void *xrealloc(void *old, size_t size)
{
    void *resized = realloc(old, size);

    if (resized == NULL) {
        oom();
    }
    return resized;
}

/* Resolve a host name / service pair into a list of TCP socket addresses.
 *
 * name:  host name or numeric address handed to getaddrinfo().
 * port:  service name or decimal port number.
 * flags: AI_* flags placed in the hint (e.g. AI_PASSIVE for a listener).
 *
 * The hint restricts results to SOCK_STREAM: every caller in this file
 * creates a stream socket from the first entry, so datagram/raw entries
 * would only be noise (or, at worst, the wrong first entry).
 *
 * Returns the list produced by getaddrinfo(); the caller releases it with
 * freeaddrinfo(). On resolution failure a message is printed to stderr and
 * the process exits with status 1, so the function never returns NULL.
 */
struct addrinfo *resolve(char *name, char *port, int flags)
{
    struct addrinfo *adr = NULL;
    struct addrinfo hint = {
        .ai_flags = flags,
        .ai_family = AF_UNSPEC,
        .ai_socktype = SOCK_STREAM,
    };

    int ret = getaddrinfo(name, port, &hint, &adr);
    if (ret) {
        fprintf(stderr, "proxy: Cannot resolve %s %s: %s\n", name, port,
                gai_strerror(ret));
        exit(1);
    }
    return adr;
}

/* Put fd into non-blocking mode.
 *
 * cache: optional memo of a previously fetched F_GETFL value. When it is
 *        NULL or holds -1 the flags are read from fd (and stored into
 *        *cache if provided); otherwise the cached value is reused and the
 *        F_GETFL call is skipped.
 *
 * Failures of either fcntl() call are reported with perror(). A failed
 * F_GETFL is never cached and never OR-ed into the F_SETFL argument, since
 * its -1 result would otherwise turn on every flag bit.
 */
void setnonblock(int fd, int *cache)
{
    int flags;

    if (cache && *cache != -1) {
        flags = *cache;
    } else {
        flags = fcntl(fd, F_GETFL, 0);
        if (flags == -1) {
            perror("fcntl(F_GETFL)");
            return;
        }
        if (cache) *cache = flags;
    }

    if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1)
        perror("fcntl(F_SETFL)");
}

/* Pipe used as the in-kernel staging area between two sockets. */
struct buffer {
    /* Pipe descriptors: move_data_in() splices into pipe[1],
     * move_data_out() splices out of pipe[0].
     */
    int pipe[2];
    /* Byte counter: increased by move_data_in() for every successful
     * splice; move_data_out() keeps draining while it is positive.
     */
    int bytes;
};

/* One endpoint of a forwarded connection. */
struct conn {
    /* The paired endpoint; NULL until openconn() links the two sides. */
    struct conn *other;
    /* Socket descriptor of this endpoint. */
    int fd;
    /* Holds data read from fd until it is written to other->fd. */
    struct buffer buf;
    /* Absolute time (time(NULL) based) at which this endpoint times out. */
    time_t expire;
    /* Link in the global expire_list. */
    struct list_head expire_node;
};

/* All live connections; newconn() and touch_conn() append at the tail and
 * expire_connections() scans from the head.
 */
LIST_HEAD(expire_list);

/* Smallest capacity the events array is grown to by epoll_add(). */
#define MIN_EVENTS 32
/* Result array passed to epoll_wait(); (re)allocated by epoll_add(). */
struct epoll_event *events;
/* num_events: decremented by epoll_del() and passed to epoll_wait() as
 *             the maximum number of events to return.
 * max_events: number of entries currently allocated in `events`.
 */
int num_events, max_events;

/* Register fd with the epoll instance efd.
 *
 * revents: event mask to watch (EPOLLIN, EPOLLOUT, EPOLLET, ...).
 * conn:    user pointer stored in the event; main() treats NULL as the
 *          listening socket.
 *
 * Before registering, the global `events` array is grown (doubling, with
 * a floor of MIN_EVENTS entries) when the EEE expression reaches
 * max_events. xrealloc() exits the process on allocation failure.
 *
 * NOTE: EEE is the quiz's fill-in-the-blank placeholder; it stands for an
 * expression over num_events and is intentionally left unresolved here.
 *
 * Returns the result of epoll_ctl(EPOLL_CTL_ADD): 0 on success, -1 with
 * errno set on failure.
 */
int epoll_add(int efd, int fd, int revents, void *conn)
{
    struct epoll_event ev = {.events = revents, .data.ptr = conn};
    if (EEE >= max_events) {
        max_events = MAX(max_events * 2, MIN_EVENTS);
        events = xrealloc(events, sizeof(struct epoll_event) * max_events);
    }
    return epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev);
}

/* Remove fd from the epoll instance efd and account for it in num_events.
 *
 * The counter is decremented first and asserted to stay non-negative.
 * Returns the result of epoll_ctl(EPOLL_CTL_DEL).
 */
int epoll_del(int efd, int fd)
{
    /* Placeholder, non-NULL event argument; it is never filled in. */
    struct epoll_event *unused = (void *) 1L;

    num_events -= 1;
    assert(num_events >= 0);
    return epoll_ctl(efd, EPOLL_CTL_DEL, fd, unused);
}

/* Create the non-blocking pipe that backs `buf`.
 * Returns buf on success; on failure reports the pipe2() error with
 * perror() and returns NULL.
 */
struct buffer *newbuffer(struct buffer *buf)
{
    int rc = pipe2(buf->pipe, O_NONBLOCK);

    if (rc >= 0) {
        return buf;
    }
    perror("pipe");
    return NULL;
}

/* Close both ends of the pipe behind `buf` (read end first). */
void delbuffer(struct buffer *buf)
{
    for (int end = 0; end < 2; end++) {
        close(buf->pipe[end]);
    }
}

/* Tear down a single, fully set-up endpoint: unlink it from expire_list,
 * close its pipe, deregister and close its socket, then free it.
 * The paired endpoint (conn->other) is not touched.
 */
void delconn(int efd, struct conn *conn)
{
    const int sock = conn->fd;

    list_del(&conn->expire_node);
    delbuffer(&conn->buf);

    /* Deregister while the descriptor is still open. */
    epoll_del(efd, sock);
    close(sock);

    free(conn);
}

/* Wrap socket fd in a new connection object and start watching it.
 *
 * efd: epoll instance to register fd with (edge-triggered, in + out).
 * fd:  connected or connecting socket; owned by this function from here on.
 * now: current time, used to compute the initial expiry.
 *
 * Returns the new connection, already appended to expire_list, or NULL on
 * failure. On failure fd is closed, matching what callers already expect.
 *
 * The error paths undo only what has actually been set up. Calling
 * delconn() on a half-built object would unlink a node that was never
 * linked, close the zero-initialised pipe descriptors (fd 0) and
 * deregister an fd that was never registered.
 */
struct conn *newconn(int efd, int fd, time_t now)
{
    struct conn *conn;
    NEW(conn);
    conn->fd = fd;

    if (!newbuffer(&conn->buf))
        goto fail_free;

    if (epoll_add(efd, fd, EPOLLIN | EPOLLOUT | EPOLLET, conn) < 0) {
        perror("epoll");
        goto fail_unregister;
    }

    conn->expire = now + connection_timeout;
    list_add_tail(&conn->expire_node, &expire_list);
    return conn;

fail_unregister:
    /* Pair the failed epoll_add() with epoll_del(), as the previous
     * delconn()-based cleanup did, so the event accounting stays balanced;
     * the EPOLL_CTL_DEL itself is expected to fail harmlessly.
     */
    epoll_del(efd, fd);
    delbuffer(&conn->buf);
fail_free:
    close(fd);
    free(conn);
    return NULL;
}

/* Accept one pending client on the listening socket lfd, make it
 * non-blocking and hand it to newconn(). An accept() failure is reported
 * with perror() and otherwise ignored.
 */
void new_request(int efd, int lfd, int *cache, time_t now)
{
    const int client = accept(lfd, NULL, NULL);

    if (client >= 0) {
        setnonblock(client, cache);
        newconn(efd, client, now);
        return;
    }
    perror("accept");
}

/* Start a non-blocking outgoing connection to `host` and pair it with
 * `other`.
 *
 * Returns the new endpoint with both `other` links set, or NULL when the
 * socket cannot be created, connect() fails with anything other than
 * EINPROGRESS, or newconn() fails.
 */
struct conn *openconn(int efd,
                      struct addrinfo *host,
                      int *cache,
                      struct conn *other,
                      time_t now)
{
    const int sock = socket(host->ai_family, SOCK_STREAM, 0);
    if (sock < 0) {
        return NULL;
    }

    setnonblock(sock, cache);

    /* A non-blocking connect normally completes later (EINPROGRESS). */
    if (connect(sock, host->ai_addr, host->ai_addrlen) < 0 &&
        errno != EINPROGRESS) {
        perror("connect");
        close(sock);
        return NULL;
    }

    struct conn *peer = newconn(efd, sock, now);
    if (peer == NULL) {
        return NULL;
    }

    peer->other = other;
    other->other = peer;
    return peer;
}

#define BUFSZ 16384 /* FIXME: configurable */
/* Splice as much as possible from socket srcfd into the pipe of `buf`,
 * up to BUFSZ bytes per call, adding every transferred chunk to
 * buf->bytes.
 *
 * Returns true once splice() reports EAGAIN/EWOULDBLOCK.
 * Returns false when splice() returns 0 or fails with any other error.
 */
bool move_data_in(int srcfd, struct buffer *buf)
{
    for (;;) {
        ssize_t moved = splice(srcfd, NULL, buf->pipe[1], NULL, BUFSZ,
                               SPLICE_F_NONBLOCK | SPLICE_F_MOVE);

        if (moved > 0) {
            buf->bytes += (int) moved;
            continue;
        }
        if (moved == 0) {
            return false;
        }
        return errno == EAGAIN || errno == EWOULDBLOCK;
    }
}

/* Drain the pipe of `buf` into socket dstfd.
 *
 * While buf->bytes is positive, up to BUFSZ bytes per iteration are
 * spliced from the pipe's read end to dstfd. The loop stops early when
 * splice() returns 0 or reports EAGAIN/EWOULDBLOCK; both cases still
 * return true. Any other splice() error returns false.
 *
 * NOTE: FFF is the quiz's fill-in-the-blank placeholder for the statement
 * executed after a successful splice; it is intentionally left unresolved
 * here.
 */
bool move_data_out(struct buffer *buf, int dstfd)
{
    while (buf->bytes > 0) {
        int bytes = buf->bytes;
        /* Never ask for more than BUFSZ in a single splice. */
        if (bytes > BUFSZ) bytes = BUFSZ;
        int n = splice(buf->pipe[0], NULL, dstfd, NULL, bytes,
                       SPLICE_F_NONBLOCK | SPLICE_F_MOVE);
        if (n == 0) break;
        if (n < 0) {
            /* Would block: keep the remaining data for a later call. */
            if (errno == EAGAIN || errno == EWOULDBLOCK) break;
            return false;
        }
        FFF;
    }
    /* Not implemented here (no epoll call is made in this function):
     * bytes > 0, add dst to epoll set. Otherwise, remove if it was added
     */
    return true;
}

/* Tear down a connection together with its paired endpoint, if any.
 * The peer is deleted first, then `conn` itself.
 */
void closeconn(int efd, struct conn *conn)
{
    struct conn *peer = conn->other;

    if (peer != NULL) {
        delconn(efd, peer);
    }
    delconn(efd, conn);
}

/* Close every connection whose expiry time has passed.
 *
 * expire_list is scanned from the head. The first entry that has not
 * expired ends the scan, and the time remaining until it expires is
 * returned in milliseconds for use as the next epoll_wait() timeout.
 * Returns -1 (wait indefinitely) when the list is empty afterwards.
 *
 * closeconn() frees both the entry and its peer, and the peer is
 * frequently the very next list node. The "safe" iterator only tolerates
 * removal of the current entry, so its saved next pointer may already be
 * freed. The scan is therefore restarted from the head after each close;
 * only the head entry is ever examined, so this costs nothing extra.
 */
int expire_connections(int efd, time_t now)
{
    struct conn *conn, *tmp;

restart:
    list_for_each_entry_safe (conn, tmp, &expire_list, expire_node) {
        if (conn->expire > now) return (conn->expire - now) * 1000;
        closeconn(efd, conn);
        /* tmp may point at the peer that was just freed: start over. */
        goto restart;
    }
    return -1;
}

/* Mark `conn` as active: move it to the tail of expire_list and push its
 * expiry to now + connection_timeout.
 */
void touch_conn(struct conn *conn, time_t now)
{
    struct list_head *node = &conn->expire_node;

    list_del(node);
    list_add_tail(node, &expire_list);
    conn->expire = now + connection_timeout;
}

/* Create the listening socket for lname:port and register it with efd.
 *
 * The socket gets SO_REUSEADDR, a backlog of 20, is made non-blocking and
 * is added to the epoll set for EPOLLIN with a NULL user pointer.
 * Any failure is fatal: the error is printed and the process exits.
 *
 * Returns the listening descriptor.
 */
int listen_socket(int efd, char *lname, char *port)
{
    struct addrinfo *bound = resolve(lname, port, AI_PASSIVE);

    int sock = socket(bound->ai_family, SOCK_STREAM, 0);
    if (sock < 0) {
        err("socket");
    }

    int reuse = 1;
    if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)) < 0) {
        err("SO_REUSEADDR");
    }
    if (bind(sock, bound->ai_addr, bound->ai_addrlen) < 0) {
        err("bind");
    }
    if (listen(sock, 20) < 0) {
        err("listen");
    }
    setnonblock(sock, NULL);
    freeaddrinfo(bound);

    if (epoll_add(efd, sock, EPOLLIN, NULL) < 0) {
        err("epoll add listen fd");
    }

    return sock;
}

/* Blank out the event mask of every not-yet-processed entry of the current
 * epoll batch (indices from..count-1) that refers to connection a or b, so
 * the dispatch loop skips entries whose connection is about to be freed.
 */
static void drop_pending_events(struct epoll_event *evs,
                                int from,
                                int count,
                                const struct conn *a,
                                const struct conn *b)
{
    for (int j = from; j < count; j++) {
        const void *who = evs[j].data.ptr;
        if (who && (who == a || who == b)) evs[j].events = 0;
    }
}

/* Close conn and its peer, first invalidating any events of the current
 * batch that still point at either of them.
 */
static void close_pair(int efd,
                       struct conn *conn,
                       struct epoll_event *evs,
                       int from,
                       int count)
{
    drop_pending_events(evs, from, count, conn, conn->other);
    closeconn(efd, conn);
}

/* Entry point: proxy inport outhost outport [listenaddr]
 *
 * Listens on listenaddr:inport (default address 0.0.0.0) and forwards each
 * accepted connection to outhost:outport. The outgoing connection is
 * opened lazily on the first readable event of the client. Idle
 * connections are closed by expire_connections(), whose return value
 * becomes the next epoll_wait() timeout. The loop never terminates.
 */
int main(int ac, char **av)
{
    if (ac != 4 && ac != 5) {
        fprintf(stderr, "Usage: proxy inport outhost outport [listenaddr]\n");
        exit(1);
    }

    struct addrinfo *outhost = resolve(av[2], av[3], 0);

    int efd = epoll_create(10);
    if (efd < 0) err("epoll_create");

    /* av[ac] is NULL, so av[4] is safe to test when ac == 4. */
    int lfd = listen_socket(efd, av[4] ? av[4] : "0.0.0.0", av[1]);

    int cache_in = -1, cache_out = -1;
    int timeo = -1;

    for (;;) {
        int nfds = epoll_wait(efd, events, num_events, timeo);
        if (nfds < 0) {
            /* An interrupted wait is not an error worth reporting. */
            if (errno != EINTR) perror("epoll");
            continue;
        }
        time_t now = time(NULL);

        for (int i = 0; i < nfds; i++) {
            /* Copy the entry: epoll_add() (reached through new_request()
             * and openconn()) may reallocate `events`, so a pointer into
             * the array must not be kept across those calls.
             */
            const uint32_t mask = events[i].events;
            struct conn *conn = events[i].data.ptr;

            /* listen socket */
            if (!conn) {
                if (mask & EPOLLIN) new_request(efd, lfd, &cache_in, now);
                continue;
            }

            /* Entry was invalidated because its connection was closed
             * earlier in this batch.
             */
            if (!mask) continue;

            if (mask & (EPOLLERR | EPOLLHUP)) {
                close_pair(efd, conn, events, i + 1, nfds);
                continue;
            }

            struct conn *other = conn->other;

            /* No attempt for partial close right now */
            if (mask & EPOLLIN) {
                touch_conn(conn, now);
                if (!other) {
                    other = openconn(efd, outhost, &cache_out, conn, now);
                    if (!other) {
                        /* Nowhere to forward to: drop the client instead
                         * of dereferencing a NULL peer.
                         */
                        close_pair(efd, conn, events, i + 1, nfds);
                        continue;
                    }
                }
                bool in = move_data_in(conn->fd, &conn->buf);
                bool out = move_data_out(&conn->buf, other->fd);
                if (!in || !out) {
                    close_pair(efd, conn, events, i + 1, nfds);
                    continue;
                }
                touch_conn(other, now);
            }

            if ((mask & EPOLLOUT) && other) {
                if (!move_data_out(&other->buf, conn->fd)) {
                    /* Close both ends: deleting only conn would leave
                     * other->other pointing at freed memory.
                     */
                    close_pair(efd, conn, events, i + 1, nfds);
                    continue;
                }
                touch_conn(conn, now);

                /* When the pipe filled up could have lost input events.
                 * Unfortunately, splice does not tell us which end was
                 * responsible for 0, so we have to ask explicitly.
                 */
                int len = 0;
                if (ioctl(other->fd, FIONREAD, &len) < 0) perror("ioctl");
                if (len > 0 && !move_data_in(other->fd, &other->buf))
                    close_pair(efd, other, events, i + 1, nfds);
            }
        }

        timeo = expire_connections(efd, now);
    }
    return 0;
}

請補完程式碼。

作答區

EEE = ?

  • (a) num_events
  • (b) num_events++
  • (c) num_events--
  • (d) ++num_events
  • (e) --num_events

FFF = ?

  • (a) 不需要加入程式碼
  • (b) buf->bytes += n
  • (c) buf->bytes -= n
  • (d) buf->bytes--
  • (e) buf->bytes++

延伸問題:

  1. 解釋上述程式碼運作原理,指出實作缺陷並改進
  2. 將 DNS 解析的機制納入,允許用 FQDN 作為輸入
  3. 嘗試克服連線數量的限制,設計實驗並探討如此 port forwarding 的效率