依據 ktcp 的指示,持續改進 sysprog21/khttpd 的程式碼,打造出高效且穩定的網頁伺服器。
依據 ktcp 的指示,在 sysprog21/khttpd 的基礎之上,利用 CMWQ 一類的機制,打造出高效且穩定的網頁伺服器,需要處理 lock-free 的資源管理議題 (如 RCU)。
搭配閱讀: 〈RCU Usage In the Linux Kernel: One Decade Later〉
An application can change the IP options on a per-socket basis by calling sys_setsockopt,
which eventually causes the kernel to call setsockopt.
setsockopt sets the new IP options, then uses call_
rcu to asynchronously free the memory storing the old IP
options. Using call_rcu
ensures all threads that might
be manipulating the old options will have released their
reference by exiting the RCU critical section.
以 ftrace 檢視 kHTTPd 運作的細節,檢視 I/O 模型並尋求效能改進空間
參考 kecho 把所有 work 使用一個鏈結串列做管理,用 is_stopped
判斷是否結束連線
/* Global server state shared between the accept loop and the workers.
 * is_stopped: set by http_server_daemon() at shutdown so in-flight
 *             workers know the server is going away.
 * head:       list of all allocated work items; walked by free_work()
 *             at teardown to reclaim them. */
struct httpd_service {
bool is_stopped;
struct list_head head;
};
extern struct httpd_service daemon;
原本的 kthread-based 實作是每個 request 都會執行一次 kthread_run
,改成 CMWQ 後改為呼叫 create_work
來建立 work,並把建立好的 work 放入 queue 中,server 結束時呼叫 free_work
釋放管理 work 的串列
/* Accept loop, run as a kthread: for each accepted connection, wrap the
 * socket in a work item via create_work() and queue it on khttpd_wq.
 * (Excerpt; elided parts marked "...".) Returns 0 when asked to stop. */
int http_server_daemon(void *arg)
{
...
INIT_LIST_HEAD(&daemon.head);
while (!kthread_should_stop()) {
...
/* Allocation failure: refuse this connection cleanly and keep going. */
if (unlikely(!(work = create_work(socket)))) {
printk(KERN_ERR MODULE_NAME
": create work error, connection closed\n");
kernel_sock_shutdown(socket, SHUT_RDWR);
sock_release(socket);
continue;
}
queue_work(khttpd_wq, work);
}
/* Signal workers to wind down before freeing their bookkeeping list. */
daemon.is_stopped = true;
free_work();
return 0;
}
配置完空間後,呼叫 INIT_WORK
建立新的 work,並初始化再把 work 加進串列中,當 work 被執行時會呼叫 http_server_worker
來完成任務
/* Allocate and initialize a work item for an accepted connection.
 * @sk: connected client socket; ownership stays with the caller on failure.
 * Returns the embedded work_struct (to hand to queue_work()) or NULL on OOM.
 * The item is also linked into daemon.head so free_work() can reclaim it.
 * NOTE(review): list_add() is unsynchronized; this is safe only while the
 * accept thread is the sole writer of daemon.head and free_work() runs
 * after the daemon thread has stopped -- confirm that ordering. */
static struct work_struct *create_work(struct socket *sk)
{
    /* sizeof(*work) instead of sizeof(struct http_request): stays correct
     * if the pointed-to type ever changes. */
    struct http_request *work = kmalloc(sizeof(*work), GFP_KERNEL);
    if (!work)
        return NULL;
    work->socket = sk;
    INIT_WORK(&work->khttpd_work, http_server_worker);
    list_add(&work->node, &daemon.head);
    return &work->khttpd_work;
}
執行命令 ./htstress http://localhost:8081 -t 3 -c 20 -n 200000
測試
requests: 200000
good requests: 200000 [100%]
bad requests: 0 [0%]
socket errors: 0 [0%]
seconds: 4.972
requests/sec: 40223.732
CMWQ:
seconds: 3.366
requests/sec: 59425.669
| | kthread | CMWQ |
|---|---|---|
| throughput (requests/sec) | 40223.732 | 59425.669 |
使用 linux/fs.h 中的 dir_context
、filp_open
、iterate_dir
函式
呼叫 http_server_send
送出 header 與 html,若目錄無法開啟則回傳 404,若可以則回傳 200 OK ,在執行 tracedir
時把目錄裡每個檔案寫在 html table 中並傳送到 client 讓網頁顯示所有的檔案
/* Serve a directory listing for the requested URL.
 * Maps the URL under the DIR document root, replies 404 if the path
 * cannot be opened, otherwise streams a 200 keep-alive header, an HTML
 * table filled row-by-row by the tracedir() actor via iterate_dir(),
 * and a closing footer. */
static void directory_listing(struct http_request *request)
{
    struct file *fp;
    char buf[SEND_BUFFER_SIZE] = {0};
    char request_url[REQUEST_URL_SIZE] = {0};

    request->dir_context.actor = tracedir;
    /* Build the on-disk path: document root + request path. */
    snprintf(request_url, REQUEST_URL_SIZE, "%s%s", DIR, request->request_url);
    fp = filp_open(request_url, O_RDONLY, 0);
    pr_info("request_url: %s\n", request_url);
    if (IS_ERR(fp)) {
        /* Fix: kernel log messages must end with '\n' or they are held
         * in the printk buffer until the next message flushes them. */
        pr_info("Open file failed\n");
        http_server_send(request->socket, HTTP_RESPONSE_404,
                         strlen(HTTP_RESPONSE_404));
        return;
    }
    snprintf(buf, SEND_BUFFER_SIZE, HTTP_RESPONSE_200_KEEPALIVE_DUMMY);
    http_server_send(request->socket, buf, strlen(buf));
    snprintf(buf, SEND_BUFFER_SIZE, "%s%s%s%s", "<html><head><style>\r\n",
             "body{font-family: monospace; font-size: 15px;}\r\n",
             "td {padding: 1.5px 6px;}\r\n",
             "</style></head><body><table>\r\n");
    http_server_send(request->socket, buf, strlen(buf));
    /* tracedir() emits one <tr> per directory entry. */
    iterate_dir(fp, &request->dir_context);
    snprintf(buf, SEND_BUFFER_SIZE, "</table></body></html>\r\n");
    http_server_send(request->socket, buf, strlen(buf));
    filp_close(fp, NULL);
}
傳送每個檔案的檔名與檔案或子目錄的連結,並用 html 的 table 包起來,並發送到 request->socket
/* dir_context actor invoked by iterate_dir() once per directory entry:
 * emits an HTML table row linking to the entry, skipping only ".".
 * IMPORTANT: @name is NOT NUL-terminated -- it is exactly @namelen bytes.
 * The original used strcmp() and "%s" on it, which read past the entry
 * name; fixed here with a length-bounded compare and "%.*s". */
static int tracedir(struct dir_context *dir_context,
                    const char *name,
                    int namelen,
                    loff_t offset,
                    u64 ino,
                    unsigned int d_type)
{
    /* Skip the "." self entry; keep ".." so clients can navigate up. */
    if (!(namelen == 1 && name[0] == '.')) {
        struct http_request *request =
            container_of(dir_context, struct http_request, dir_context);
        char buf[SEND_BUFFER_SIZE] = {0};
        snprintf(buf, SEND_BUFFER_SIZE,
                 "<tr><td><a href=\"%.*s\">%.*s</a></td></tr>\r\n", namelen,
                 name, namelen, name);
        http_server_send(request->socket, buf, strlen(buf));
    }
    return 0;
}
結果:
原本的實作只會顯示目錄中的所有檔案,新增判斷開啟的是目錄或檔案來回傳檔案內容
/* Extended directory_listing(): distinguishes directories from regular
 * files and streams a regular file's raw contents to the client.
 * (Excerpt; elided parts marked "...".) */
static void directory_listing(struct http_request *request)
{
...
if (S_ISDIR(fp->f_inode->i_mode)) {
...
} else if (S_ISREG(fp->f_inode->i_mode)) {
http_server_send(request->socket, buf, strlen(buf));
/* NOTE(review): allocating i_size bytes in one kmalloc does not scale
 * to large files -- a fixed-size chunked read loop would be safer. */
char *file_content = kmalloc(fp->f_inode->i_size, GFP_KERNEL);
if (!file_content) {
pr_info("malloc failed");
filp_close(fp, NULL);
return;
}
/* NOTE(review): kernel_read()'s last parameter is a loff_t *pos;
 * passing literal 0 means a NULL position pointer -- confirm the
 * kernel version in use accepts that. ret is the byte count read. */
int ret = kernel_read(fp, file_content, fp->f_inode->i_size, 0);
http_server_send(request->socket, file_content, ret);
kfree(file_content);
}
filp_close(fp, NULL);
return;
}
把 MIME type 加入 hashtable 中做查詢
hashtable 結構:
/* One (file extension -> MIME type) mapping; node chains the entry into
 * the mime_map_table hashtable bucket selected by mime_hash(extension). */
struct mime_map_entry {
const char *extension;
const char *mime_type;
struct hlist_node node;
};
把 MIME type 跟對應的副檔名加入 hashtable 中,在 module 初始化時呼叫
/* Populate mime_map_table from the NULL-terminated mime_map array,
 * hashing each entry by its file extension. Called once at module
 * initialization. */
void init_mime_map_table(void)
{
    struct mime_map_entry *e = mime_map;

    hash_init(mime_map_table);
    while (e->extension) {
        hash_add(mime_map_table, &e->node, mime_hash(e->extension));
        e++;
    }
}
查詢 hashtable 得到 MIME type
const char *get_mime_type(const char *extension)
{
if (!extension)
return "text/plain";
struct mime_map_entry *entry;
struct hlist_node *node;
hash_for_each_possible(mime_map_table, entry, node, mime_hash(extension))
{
if (!strcmp(entry->extension, extension))
return entry->mime_type;
}
return "text/plain";
}
根據不同副檔名傳送不同 content-type
/* directory_listing() with Content-Type negotiation: derives the MIME
 * type from the first '.' in the request URL and builds the 200 header
 * with it. Diff excerpt from the write-up: lines prefixed "+" are the
 * additions; "..." marks elided code.
 * NOTE(review): strchr() finds the FIRST dot, so "archive.tar.gz" maps
 * to the ".tar.gz" lookup key -- confirm the mime table expects that. */
static void directory_listing(struct http_request *request)
{
...
if (S_ISDIR(fp->f_inode->i_mode)) {
...
} else if (S_ISREG(fp->f_inode->i_mode)) {
+ const char *extension = strchr(request->request_url, '.');
+ snprintf(buf, SEND_BUFFER_SIZE, "%s%s%s%s", "HTTP/1.1 200 OK\r\n",
+ "Content-Type: ", get_mime_type(extension),
+ "\r\nConnection: Keep-Alive\r\n\r\n");
http_server_send(request->socket, buf, strlen(buf));
...
int ret = kernel_read(fp, file_content, fp->f_inode->i_size, 0);
http_server_send(request->socket, file_content, ret);
kfree(file_content);
}
filp_close(fp, NULL);
return;
}
可以開啟並顯示圖片
0) 0.130 us | http_should_keep_alive [khttpd]();
0) | directory_listing [khttpd]() {
0) 8.311 us | filp_open();
0) 4.862 us | _printk();
0) + 24.243 us | http_server_send.isra.0 [khttpd]();
0) + 21.330 us | http_server_send.isra.0 [khttpd]();
0) ! 815.615 us | iterate_dir();
0) + 17.077 us | http_server_send.isra.0 [khttpd]();
0) 0.732 us | filp_close();
0) ! 893.672 us | }
0) ! 894.180 us | }
0) ! 897.131 us | }
可以發現 iterate_dir()
花最多時間,其次是 http_server_send
(),參考 Jerejere0808 的開發紀錄使用 content cache 來加快 iterate_dir()
的執行時間
為了避免每次請求時都執行 iterate_dir()
或 kernel_read()
,可以將回應快取下來,以便在下次有相同的請求時不需要再次讀取。
使用 kernel 的 hashtable API 來實作快取,並且由於檔案系統符合資料讀取頻繁,資料寫入較少的情況,加入 Read-Copy Update (RCU) 同步機制,以支援多個讀取端同時存取快取的需求。
快取項目結構:
/* One cached HTTP response, keyed by request URL and chained into the
 * RCU-protected content_cache_table.
 * NOTE(review): a per-entry spinlock cannot serialize concurrent
 * insertions into the SHARED table; this is only safe with a single
 * writer or an additional table-wide lock -- verify the writer model. */
struct content_cache_entry {
const char *request_url;
const char *response;
struct hlist_node node;
spinlock_t lock;
};
每次 request 先根據 request->url
查詢 hashtable,若有就直接傳送,若無則跟原本一樣判斷 request 是檔案還是目錄,讀取完後再插入到 hashtable 中
/* Cache-aware directory_listing(): serve a previously cached response
 * when one exists for this URL, otherwise build the response as before
 * and insert it into the cache. (Excerpt; elided parts marked "...".) */
static void directory_listing(struct http_request *request)
{
...
// get cache content
/* Cache hit: request->cache_buffer was filled by the lookup above;
 * a non-empty buffer means the whole response can be replayed. */
if (strlen(request->cache_buffer) != 0) {
http_server_send(request->socket, request->cache_buffer,
strlen(request->cache_buffer));
return;
}
...
// If not found in the cache, then insert the content into cache.
insert_content_cache(request_url, request->cache_buffer);
filp_close(fp, NULL);
return;
}
參考 sehttpd、作業說明使用 min heap 實作 timer
timer 與 queue 的結構
/* Heap node for a cache-entry timer.
 * key:       absolute expiry time in milliseconds (current_msec + timeout).
 * pos:       the node's index within the heap array.
 * hash_node: hashtable linkage of the cached entry, so expiry can call
 *            hash_del_rcu() on it. */
typedef struct {
size_t key;
size_t pos;
struct hlist_node hash_node;
} timer_node;
/* Min-heap priority queue ordered by comp.
 * priv:   backing array of node pointers.
 * nalloc: number of live nodes; size: presumably the array capacity
 *         -- confirm against the allocator. Both atomic because the
 *         heap is touched from multiple contexts. */
typedef struct {
void **priv;
atomic_t nalloc;
atomic_t size;
prio_queue_comparator comp;
} prio_queue_t;
在將 response content 插入 cache 時呼叫 cache_add_timer
把 timer 插入 queue 中
/* Insert a freshly built response into the content cache and arm its
 * expiry timer. (Excerpt; elided parts marked "...".) */
void insert_content_cache(char *request_url, char *cache_buffer)
{
...
cache_add_timer(entry, TIMEOUT_DEFAULT);
spin_lock_init(&entry->lock);
/* NOTE(review): locking the entry's own, just-initialized lock does
 * not protect content_cache_table from concurrent writers inserting
 * OTHER entries -- confirm only one thread ever inserts. */
spin_lock(&entry->lock);
hash_add_rcu(content_cache_table, &entry->node, request_hash(request_url));
spin_unlock(&entry->lock);
}
設定好 timer_node
後呼叫 prio_queue_insert()
確認節點在 min heap 中的位置
/* Attach an expiry timer to a cache entry and push it onto the min heap.
 * @timeout: lifetime in milliseconds from now.
 * Silently does nothing on allocation failure (the entry just never
 * expires via the heap). */
void cache_add_timer(struct content_cache_entry *entry, size_t timeout)
{
    timer_node *node = kmalloc(sizeof(timer_node), GFP_KERNEL);
    if (!node)
        return;
    time_update();
    entry->timer = node;
    /* Fix: "&current_msec" was mojibake ("¤t_msec", a stray HTML
     * &curren; entity) in the original text. */
    node->key = atomic_read(&current_msec) + timeout;
    node->pos = atomic_read(&timer.nalloc) + 1;
    /* NOTE(review): this copies the hlist_node BY VALUE; a copied list
     * node does not relink its neighbours, so hash_del_rcu() on the copy
     * is suspect -- it likely should keep a pointer to the entry instead. */
    node->hash_node = entry->node;
    prio_queue_insert(&timer, node);
}
在每次 request 檢查有沒有 timer expired,因為是 min heap 所以只要檢查 heap 中的第一個即可
void handle_expired_timers()
{
while (!prio_queue_is_empty(&timer)) {
time_update();
timer_node *node = prio_queue_min(&timer);
if (node->key > atomic_read(¤t_msec))
return;
prio_queue_delmin(&timer);
}
}
用 do-while 來確保 min heap 沒有被同時修改,並把第一個節點跟最後一個有效的節點互換,交換完成之後把有效節點 nalloc 減一,再呼叫 prio_queue_sink
把被交換的節點下沉到正確的位置,最後再呼叫 hash_del_rcu
來把快取的內容移除,移除完之後呼叫 synchronize_rcu
來同步讀者
/* Remove the minimum (earliest-deadline) node from the heap, then
 * unlink the corresponding cache entry under RCU.
 * (Excerpt; elided parts marked "...".) */
static bool prio_queue_delmin(prio_queue_t *ptr)
{
...
do {
if (prio_queue_is_empty(ptr))
return true;
/* Snapshot the live count, swap root with the last live node, then
 * re-check the count: if another thread changed the heap meanwhile,
 * undo the swap and retry the whole sequence. */
nalloc = atomic_read(&ptr->nalloc);
prio_queue_swap(ptr, 1, nalloc);
if (nalloc == atomic_read(&ptr->nalloc)) {
node = ptr->priv[nalloc];
break;
}
/* Conflict detected: swap back and retry.
 * NOTE(review): swap/undo is not atomic as a pair, so a concurrent
 * reader can still observe the transiently swapped heap -- confirm
 * callers tolerate that. */
prio_queue_swap(ptr, 1, nalloc);
} while (1);
atomic_dec(&ptr->nalloc);
/* Sink the promoted last element to restore the heap invariant. */
prio_queue_sink(ptr, 1);
/* Unpublish the cached entry, then wait for all RCU readers that may
 * still hold a reference to drain before the caller frees it. */
hash_del_rcu(&node->hash_node);
synchronize_rcu();
return true;
}
並在有相同 request 時延長 timer 的時間
/* Extend @node's deadline on a repeated request (cache hit) and restore
 * heap ordering afterwards. @timeout: new lifetime in ms from now. */
void cache_timer_update(timer_node *node, size_t timeout)
{
    time_update();
    /* Fix: "&current_msec" was mojibake ("¤t_msec") in the
     * original text. */
    node->key = atomic_read(&current_msec) + timeout;
    /* NOTE(review): this relies on prio_queue_sink() returning the new
     * position, yet prio_queue_delmin() ignores its return value --
     * confirm the function's actual signature supports this use. */
    node->pos = prio_queue_sink(&timer, node->pos);
}
執行 ./htstress http://localhost:8081 -n 20000
測試
requests: 20000
good requests: 20000 [100%]
bad requests: 0 [0%]
socket errors: 0 [0%]
seconds: 1.096
requests/sec: 18254.904
因為兩萬次 request 都是相同的,所以只有第一次需要讀檔,requests/sec 從 7474.638 提升到 18254.904