Kswapd trace

tags: LINUX KERNEL

The following is based on Linux v4.14.

Each NUMA node has its own kswapd process: [kswapd0], [kswapd1], ...

node

/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
struct bootmem_data;
typedef struct pglist_data {

        /* kswapd-related fields */
        wait_queue_head_t kswapd_wait;  // kswapd sleeps here while idle
        wait_queue_head_t pfmemalloc_wait; // direct-reclaim tasks wait here
        struct task_struct *kswapd; // points to the kswapd thread's task_struct
        int kswapd_order;     // allocation order kswapd should reclaim for
        enum zone_type kswapd_classzone_idx; // highest zone index kswapd should scan up to
        int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
} pg_data_t;

Initialization

kswapd_run() sets kswapd() as the thread function and creates one kernel thread per node:

int kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state < SYSTEM_RUNNING);
		pr_err("Failed to start kswapd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kswapd);
		pgdat->kswapd = NULL;
	}
	return ret;
}

static int __init kswapd_init(void)
{
	int nid, ret;

	swap_setup();
	for_each_node_state(nid, N_MEMORY)
 		kswapd_run(nid);
	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"mm/vmscan:online", kswapd_cpu_online,
					NULL);
	WARN_ON(ret < 0);
	return 0;
}
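
The per-node thread then runs kswapd() as its main loop. Below is a heavily condensed sketch (v4.14-era logic; freezing, the retry label and the order/classzone_idx hysteresis are elided): the thread sleeps on pgdat->kswapd_wait until wakeup_kswapd() stores a target order/classzone_idx and wakes it, then it calls balance_pgdat().

static int kswapd(void *p)
{
	pg_data_t *pgdat = (pg_data_t *)p;
	unsigned int alloc_order, reclaim_order;
	unsigned int classzone_idx;

	for ( ; ; ) {
		/* pick up the request stored by wakeup_kswapd() */
		alloc_order = reclaim_order = pgdat->kswapd_order;
		classzone_idx = pgdat->kswapd_classzone_idx;

		/* sleep on pgdat->kswapd_wait until woken */
		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
				    classzone_idx);

		if (kthread_should_stop())
			break;

		/* reclaim until the node is balanced for this request */
		reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
	}
	return 0;
}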

Wakeup

Although kswapd is created at boot time, it is only woken up when an allocation drops into the slow path because free memory is running low.

__alloc_pages_slowpath()
    wake_all_kswapds()
        wakeup_kswapd()
            pgdat_balanced()
            
/*
 * Returns true if there is an eligible zone balanced for the request order
 * and classzone_idx
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) {
    
}
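
The body is elided above. Roughly (a simplified sketch of the v4.14 logic, not the verbatim source), it walks the zones up to classzone_idx and returns true if any of them still clears its high watermark for the requested order:

static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
	int i;
	struct zone *zone;

	for (i = 0; i <= classzone_idx; i++) {
		zone = pgdat->node_zones + i;

		if (!managed_zone(zone))
			continue;

		/* balanced == at least one zone above its high watermark */
		if (zone_watermark_ok_safe(zone, order,
					   high_wmark_pages(zone), classzone_idx))
			return true;
	}
	return false;
}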

/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
    ...

    pgdat = zone->zone_pgdat;
    curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);


    // update pgdat->kswapd_classzone_idx, pgdat->kswapd_order
    ...
        
    if (!waitqueue_active(&pgdat->kswapd_wait))
        return;

    /* Hopeless node, leave it to direct reclaim */
    if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
        return;

    // at least one zone can satisfy this order while staying above its high watermark
    if (pgdat_balanced(pgdat, order, classzone_idx))
        return;

    trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
    // wake up the kswapd process sleeping on the kswapd_wait queue
    wake_up_interruptible(&pgdat->kswapd_wait);
}

static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *last_pgdat = NULL;

        // walk every zone in the zonelist and wake the kswapd of each zone's node (skipping nodes already woken)
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
					ac->high_zoneidx, ac->nodemask) {
		if (last_pgdat != zone->zone_pgdat)
			wakeup_kswapd(zone, order, ac->high_zoneidx);
		last_pgdat = zone->zone_pgdat;
	}
}

Reclaim

kswapd()
    kswapd_try_to_sleep()
    balance_pgdat()
        pgdat_balanced() // reclaim only when no zone can satisfy the request
        age_active_anon() // why age only the anon list?
            shrink_active_list()
        mem_cgroup_soft_limit_reclaim // soft limit reclaim first
        kswapd_shrink_node()
            shrink_node()
                shrink_node_memcg()
                    get_scan_count()
                    for_each_evictable_lru() {
                        shrink_list()
                            if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
                                shrink_active_list()
                            shrink_inactive_list()
                                shrink_page_list()
                                    page_check_references()
                                        page_referenced()
                                            rmap_walk()
                    }
                    // why shrink only the active anon list?
                    if (inactive_list_is_low(lruvec, false, sc, true))
		            shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON);
                shrink_slab()
                vmpressure()

struct scan_control records the state of a reclaim pass:

struct scan_control {
    /* How many pages shrink_list() should reclaim */
    unsigned long nr_to_reclaim; 
    /* This context's GFP mask */
    gfp_t gfp_mask; 
    /* Allocation order */
    int order; 
    /*
     * Nodemask of nodes allowed by the caller. If NULL, all nodes
     * are scanned.
     */
    nodemask_t	*nodemask;
    /* target a specific mem cgroup; NULL means global reclaim */
    struct mem_cgroup *target_mem_cgroup;
    /* Scan (total_size >> priority) pages at once */
    int priority;
    /* The highest zone to isolate pages for reclaim from */
    enum zone_type reclaim_idx;
    
    unsigned int may_writepage:1;
    unsigned int may_unmap:1;
    unsigned int may_swap:1;
    
    /* One of the zones is ready for compaction */
    unsigned int compaction_ready:1;
    /* number of inactive pages scanned */
    unsigned long nr_scanned;
    /* number of pages reclaimed in shrink_zones() */
    unsigned long nr_reclaimed;
};

What is gfp_mask?

https://lwn.net/Articles/23042/

gfp = get free pages

A GFP mask consists of two parts:

  • memory zone flags: if none is set the default is ZONE_NORMAL (there is no __GFP_NORMAL flag)
  • allocation flags
    • __GFP_IO: the allocation may start physical I/O
    • __GFP_FS: the allocation may call into the VFS / filesystem layer
    • __GFP_DIRECT_RECLAIM: the caller itself may enter direct reclaim
    • __GFP_KSWAPD_RECLAIM: kswapd may be woken to reclaim in the background
    • __GFP_RECLAIM: both of the above
// include/linux/gfp.h
...
#define __GFP_IO	((__force gfp_t)___GFP_IO)
#define __GFP_FS	((__force gfp_t)___GFP_FS)
#define __GFP_DIRECT_RECLAIM	((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */
#define __GFP_KSWAPD_RECLAIM	((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */
#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM))
...
/*
 * Useful GFP flag combinations that are commonly used. It is recommended
 * that subsystems start with one of these combinations and then set/clear
 * __GFP_FOO flags as necessary.
...
 * GFP_KERNEL is typical for kernel-internal allocations. The caller requires
 *   ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
...
*/
#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
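
To make the difference concrete, an illustrative snippet (not part of the trace) contrasting GFP_KERNEL with GFP_ATOMIC, which keeps __GFP_KSWAPD_RECLAIM but drops __GFP_DIRECT_RECLAIM:

#include <linux/slab.h>

static void *gfp_example(void)
{
	/*
	 * GFP_KERNEL = __GFP_RECLAIM | __GFP_IO | __GFP_FS: the caller may
	 * sleep, enter direct reclaim, start I/O and re-enter the filesystem;
	 * kswapd may also be woken.
	 */
	void *a = kmalloc(4096, GFP_KERNEL);

	/*
	 * GFP_ATOMIC: kswapd is woken in the background, but the caller never
	 * sleeps in reclaim, so the allocation can simply fail under pressure.
	 */
	void *b = kmalloc(4096, GFP_ATOMIC);

	kfree(b);
	return a;
}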

balance_pgdat

static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
    ...
    struct scan_control sc = {
        .gfp_mask = GFP_KERNEL,
        .order = order,
        .priority = DEF_PRIORITY, // 12
        .may_writepage = !laptop_mode, // laptop mode ?
        .may_unmap = 1,
        .may_swap = 1,
    };
    psi_memstall_enter(&pflags);
    count_vm_event(PAGEOUTRUN);

    do {
        ...
        // If the number of buffer_heads exceeds the maximum allowed, consider reclaiming from all zones.
        if (buffer_heads_over_limit) {
            ...
        }
        
        // reclaim only when no zone can satisfy the request
        if (pgdat_balanced(pgdat, sc.order, classzone_idx))
            goto out;
        
        // Do some background aging of the anon list
        age_active_anon(pgdat, &sc);
        
        // if reclaim is struggling (priority has dropped below DEF_PRIORITY - 2), force may_writepage on
        if (sc.priority < DEF_PRIORITY - 2)
            sc.may_writepage = 1;
    
        // soft limit reclaim first
        ...
        mem_cgroup_soft_limit_reclaim()
        
        if (kswapd_shrink_node(pgdat, &sc))
            raise_priority = false;
        
        ...
    
        // Raise priority if scanning rate is too low
        ...
    } while (sc.priority >= 1);
}

kswapd_shrink_node

static bool kswapd_shrink_node(pg_data_t *pgdat,
			       struct scan_control *sc)
{
    /* Reclaim a number of pages proportional to the number of zones */
    sc->nr_to_reclaim = 0;
    for (z = 0; z <= sc->reclaim_idx; z++) {
        zone = pgdat->node_zones + z;
        if (!managed_zone(zone))
            continue;

        sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
    }
    
    shrink_node(pgdat, sc);
    
    ...
        
    return sc->nr_scanned >= sc->nr_to_reclaim;
}

shrink_node

Walks every memory cgroup under the reclaim target and, for each one:

  1. reclaims pages from its LRU lists
  2. calls the registered shrinkers to reclaim slab objects
static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) 
{
    ...
    do {
        struct mem_cgroup *root = sc->target_mem_cgroup;
        struct mem_cgroup_reclaim_cookie reclaim = {
            .pgdat = pgdat,
            .priority = sc->priority,
        };
        unsigned long node_lru_pages = 0;
        struct mem_cgroup *memcg;
        
        nr_reclaimed = sc->nr_reclaimed;
        nr_scanned = sc->nr_scanned;
        
        memcg = mem_cgroup_iter(root, NULL, &reclaim);
        do {
            ...
            shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
            ...
            if (memcg)
                shrink_slab(sc->gfp_mask, pgdat->node_id,
					    memcg, sc->priority);
        } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
        
    } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
					 sc->nr_scanned - nr_scanned, sc));
    
    return reclaimable;
}

shrink_node_memcg

This is the core reclaim function; both kswapd and direct reclaim eventually end up here.

  1. get_scan_count() decides how many pages to scan/reclaim from each LRU list:
    1. no swap space available -> SCAN_FILE
    2. reclaim inside a memory cgroup honours swappiness literally: swappiness == 0 -> SCAN_FILE
    3. priority == 0 means pressure is already severe, so both anon and file are reclaimed, unless swappiness == 0 -> SCAN_EQUAL
      • priority starts at 12 (DEF_PRIORITY) and is decremented every time a pass fails to reclaim the requested amount; priority == 0 means even scanning the whole list was not enough, so it is best to reclaim both kinds of pages
    4. file pages are already scarce (file + free <= total high watermark) and the inactive anon list still has enough pages to reclaim -> SCAN_ANON
      if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
          if (!inactive_list_is_low(lruvec, false, sc, false) &&
              lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) >> sc->priority) {
              scan_balance = SCAN_ANON;
              goto out;
          }
      }
      
    5. there is still plenty of inactive page cache -> SCAN_FILE
      if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) &&
          !inactive_list_is_low(lruvec, true, sc, false) &&
          lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
          scan_balance = SCAN_FILE;
          goto out;
      }
      
    6. only then is swappiness consulted to split the pressure -> SCAN_FRACT
      anon_prio = swappiness;
      file_prio = 200 - anon_prio;

      anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
          lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
      file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
          lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);

      // periodically halve reclaim_stat so recent history carries more weight
      if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
          reclaim_stat->recent_scanned[0] /= 2;
          reclaim_stat->recent_rotated[0] /= 2;
      }

      if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
          reclaim_stat->recent_scanned[1] /= 2;
          reclaim_stat->recent_rotated[1] /= 2;
      }

      ap = anon_prio * (recent_scanned[0] + 1) / (recent_rotated[0] + 1)
      fp = file_prio * (recent_scanned[1] + 1) / (recent_rotated[1] + 1)

      The more recent_rotated a list has, the more valuable its pages are, so it gets proportionally less scan pressure.
    • how each scan_balance value becomes per-list scan targets (see the sketch after this list):
      • SCAN_EQUAL: every list is scanned by (list size >> priority)
      • SCAN_FRACT: the file lists' target is weighted by fp, the anon lists' by ap
      /*
       * Scan types proportional to swappiness and
       * their relative recent reclaim efficiency.
       * Make sure we don't miss the last page on
       * the offlined memory cgroups because of a
       * round-off error.
       */

      • SCAN_ANON: only the anon lists are scanned, by (list size >> priority); file targets are 0
      • SCAN_FILE: only the file lists are scanned, by (list size >> priority); anon targets are 0
  2. Once the per-list targets are known, the actual reclaim runs: each round takes at most 32 (SWAP_CLUSTER_MAX) pages from each list and feeds them to shrink_list():
    static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                     struct lruvec *lruvec, struct scan_control *sc)
    {
        if (is_active_lru(lru)) {
            if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
                shrink_active_list(nr_to_scan, lruvec, sc, lru);
            return 0;
        }

        return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
    }

    • Note that although a scan target is computed for every list, an active list is only shrunk when its inactive counterpart is considered low; otherwise the pages budgeted for the active list are simply skipped, so real reclaim happens only from the inactive lists.
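
The sketch referenced above: a condensed version of the final loop of get_scan_count() (trimmed; the real code also handles offlined cgroups and rounding), showing how scan_balance is turned into the per-list nr[] targets:

	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long size, scan;

		size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
		scan = size >> sc->priority;

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* scan every list relative to its size */
			break;
		case SCAN_FRACT:
			/* weight by the ap/fp fractions computed earlier */
			scan = div64_u64(scan * fraction[file], denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* scan one type exclusively, zero the other */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		}

		nr[lru] = scan;
	}
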
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
			      struct scan_control *sc, unsigned long *lru_pages)
{
    struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
    unsigned long nr[NR_LRU_LISTS];
    unsigned long targets[NR_LRU_LISTS];
    unsigned long nr_to_scan;
    enum lru_list lru;
    unsigned long nr_reclaimed = 0;
    unsigned long nr_to_reclaim = sc->nr_to_reclaim;
    struct blk_plug plug;
    bool scan_adjusted;
    
    get_scan_count(lruvec, memcg, sc, nr, lru_pages);
    ...
    while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
        unsigned long nr_anon, nr_file, percentage;
        unsigned long nr_scanned;

        for_each_evictable_lru(lru) {
            if (nr[lru]) {
                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
                nr[lru] -= nr_to_scan;

                nr_reclaimed += shrink_list(lru, nr_to_scan,
                                lruvec, sc);
            }
        }
        
        if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
            continue;
    
        // adjust nr[] so the final anon/file split keeps the ratio computed by get_scan_count()
    }
    
    if (inactive_list_is_low(lruvec, false, sc, true))
        shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON);
}
  1. From the while-loop condition you can see that shrink_node_memcg() only drives LRU_INACTIVE_ANON and the two file lists down to their targets; LRU_ACTIVE_ANON is not part of the termination condition.
  2. Each round scans at most 32 pages (SWAP_CLUSTER_MAX) per list.
  3. To keep the anon/file split at the ratio computed by get_scan_count(), the remaining nr[] values are adjusted once nr_to_reclaim has been reached (see the sketch below):
    • the nr[] of the LRU type that has made proportionally more progress is set to 0, and the other type is scaled down to its unfinished share.
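
The adjustment looks roughly like this (condensed from the v4.14 shrink_node_memcg() loop):

		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

		/* pick the LRU type with proportionally less work left */
		if (nr_file > nr_anon) {
			scan_target = targets[LRU_INACTIVE_ANON] +
				      targets[LRU_ACTIVE_ANON] + 1;
			lru = LRU_BASE;		/* anon */
			percentage = nr_anon * 100 / scan_target;
		} else {
			scan_target = targets[LRU_INACTIVE_FILE] +
				      targets[LRU_ACTIVE_FILE] + 1;
			lru = LRU_FILE;
			percentage = nr_file * 100 / scan_target;
		}

		/* stop scanning the type that has made more progress... */
		nr[lru] = 0;
		nr[lru + LRU_ACTIVE] = 0;

		/* ...and cut the other type down to its unfinished share so the
		 * final anon/file split matches the original proportions */
		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		lru += LRU_ACTIVE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		scan_adjusted = true;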

inactive_list_is_low

A fairly important helper above is inactive_list_is_low(), which decides whether the inactive list is too small; when it returns true, the active list is shrunk.

The comments in the code explain that:

  1. the inactive list should be small enough that the kernel does not waste time scanning lots of pages, and so that most pages stay on the active list to form the working set; pages on the active list are scanned much less often
  2. the inactive list must not be too small either, to avoid thrashing and to give pages enough time to be referenced again and promoted to the active list

The kernel combines a size-based ratio with refault information to decide how large the inactive list should be.

https://zhuanlan.zhihu.com/p/421298579
When new file pages keep being generated and added at the head of the inactive file LRU while the kernel keeps reclaiming from its tail, and the stream of new pages is large enough to keep the inactive list long and keep satisfying the reclaim target, shrinking of the active LRU may never be triggered. In the extreme case the active LRU becomes isolated: even if it holds many very old pages, they never get a chance to be demoted to the inactive list. The workingset_activate counter helps detect and mitigate this situation.

static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
				 struct scan_control *sc, bool trace)
{
    ...
    inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
    active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
    
    /*
     * When refaults are being observed, it means a new workingset
     * is being established. Disable active list protection to get
     * rid of the stale workingset quickly.
     */
    refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
    if (file && lruvec->refaults != refaults) {
        inactive_ratio = 0;
    } else {
        gb = (inactive + active) >> (30 - PAGE_SHIFT);
        if (gb)
            inactive_ratio = int_sqrt(10 * gb);
        else
            inactive_ratio = 1;
    }
    ...
    return inactive * inactive_ratio < active;
}
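
Plugging numbers into the int_sqrt(10 * gb) heuristic reproduces the target ratios documented in the comment above this function in mm/vmscan.c (when no refaults are observed):

 total memory    inactive_ratio    max inactive
       1GB             3              ~250MB
      10GB            10              ~0.9GB
     100GB            31               ~3GB
       1TB           101              ~10GB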

shrink_inactive_list

static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
		     struct scan_control *sc, enum lru_list lru) 
{
    ...
    ---
    nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
				         &nr_scanned, sc, isolate_mode, lru);
    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
    reclaim_stat->recent_scanned[file] += nr_taken;
    
    pgscan += nr_scanned
    ---
        
    nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
                &stat, false);

    ...
        
    putback_inactive_pages(lruvec, &page_list);

    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
    
    ...
}

In shrink_page_list(), page_check_references() decides where each page goes (a condensed sketch of it follows this list):

  • PAGEREF_ACTIVATE: inactive -> active
    • referenced anon pages, pages referenced from more than one pte ("shared"), and executable file pages
    • file pages that have been referenced more than once
  • PAGEREF_KEEP: inactive -> inactive (a referenced file page on its first reference)
  • PAGEREF_RECLAIM_CLEAN: reclaim, but only if the page is clean
  • PAGEREF_RECLAIM: reclaim
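
A condensed sketch of page_check_references() (v4.14-era; the dirty/writeback handling around it in shrink_page_list() is separate):

static enum page_references page_check_references(struct page *page,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
					  &vm_flags);
	referenced_page = TestClearPageReferenced(page);

	/* mlocked: let try_to_unmap() move it to the unevictable list */
	if (vm_flags & VM_LOCKED)
		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		if (PageSwapBacked(page))	/* anon / shmem */
			return PAGEREF_ACTIVATE;

		/* file pages need a second reference to be activated */
		SetPageReferenced(page);
		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;

		/* activate executable file pages after first use */
		if (vm_flags & VM_EXEC)
			return PAGEREF_ACTIVATE;

		return PAGEREF_KEEP;
	}

	/* reclaim if clean, defer dirty pages to writeback */
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;

	return PAGEREF_RECLAIM;
}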

shrink_active_list

active -> inactive

  • if an executable file page is found to have been referenced when scanned, it is moved back to the active list
static void shrink_active_list(unsigned long nr_to_scan,
			       struct lruvec *lruvec,
			       struct scan_control *sc,
			       enum lru_list lru)
{
    ...
    ---
    nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                     &nr_scanned, sc, isolate_mode, lru);
    __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
	reclaim_stat->recent_scanned[file] += nr_taken;
    
    pgrefill += nr_scanned
    ---
        
    while (!list_empty(&l_hold)) {
        ...
        if (page_referenced(page, 0, sc->target_mem_cgroup,
                    &vm_flags)) {
            nr_rotated += hpage_nr_pages(page);
            // only executable file pages are put back on the active list here
            if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
                list_add(&page->lru, &l_active);
                continue;
            }
        }
        ClearPageActive(page);	/* we are de-activating */
        SetPageWorkingset(page);
        list_add(&page->lru, &l_inactive);
    }
}

Move LRU page reclaim from zones to nodes

Originally there was one set of LRU lists per zone; this patch series moved them to be per node.

HIGHMEM zone

High memory (highmem) is used when the size of physical memory approaches or exceeds the maximum size of virtual memory. At that point it becomes impossible for the kernel to keep all of the available physical memory mapped at all times. This means the kernel needs to start using temporary mappings of the pieces of physical memory that it wants to access.

On a 64-bit system the kernel can reach any physical page through its linear mapping: the kernel virtual address is just the physical address plus a constant offset in the 0xffff... region, and since physical memory is always far smaller than that address space, everything can stay mapped. On a 32-bit system the kernel's share of the address space starts at 0xC0000000 and is only 1GB, so with 4GB of RAM roughly 3GB of it cannot be permanently mapped; that part becomes highmem.

This permanently mapped region is the kernel's direct mapping.
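
Conceptually the direct mapping is just an offset, along the lines of the generic __va()/__pa() helpers (the exact definitions are per-architecture):

/* lowmem (direct-mapped) phys <-> virt conversion: a fixed offset, so no
 * page tables need to be set up at access time */
#define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET))
#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET)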

How pages get onto the LRU lists

/**
 * lru_cache_add - add a page to a page list
 * @page: the page to be added to the LRU.
 *
 * Queue the page for addition to the LRU via pagevec. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
 * have the page added to the active list using mark_page_accessed().
 */
void lru_cache_add(struct page *page)
{
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	__lru_cache_add(page);
}

https://blog.csdn.net/weixin_42730667/article/details/123438959
page fault path

handle_mm_fault()
    __handle_mm_fault()
        handle_pte_fault()
            if the pte is not present
                do_anonymous_page()
                    lru_cache_add_active_or_unevictable()
                        SetPageActive(page);
                        lru_cache_add()
                do_fault()
                    do_read_fault()
                        vma->vm_ops->map_pages(filemap_map_pages())
                        vma->vm_ops->fault(filemap_fault())
                            find_get_page()
                            pagecache_get_page()
                                add_to_page_cache_lru()
                                    lru_cache_add()
                            mapping->a_ops->readpage(generic_file_buffered_read())
                                add_to_page_cache_lru()
                                    lru_cache_add()
                            

For anon pages:

mm/swap.c
/**
 * lru_cache_add_active_or_unevictable
 * @page:  the page to be added to LRU
 * @vma:   vma in which page is mapped for determining reclaimability
 *
 * Place @page on the active or unevictable LRU list, depending on its
 * evictability.  Note that if the page is not evictable, it goes
 * directly back onto it's zone's unevictable list, it does NOT use a
 * per cpu pagevec.
 */
void lru_cache_add_active_or_unevictable(struct page *page,
					 struct vm_area_struct *vma)
{
    VM_BUG_ON_PAGE(PageLRU(page), page);

    if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
        SetPageActive(page);
        lru_cache_add(page);
        return;
    }
    ...
}

For file pages:

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
    ...
        WARN_ON_ONCE(PageActive(page));
        if (!(gfp_mask & __GFP_WRITE) && shadow)
            workingset_refault(page, shadow);
        lru_cache_add(page);
    ...
}

So an anon page gets SetPageActive() before it is added to the LRU list, while a file page does not.

https://lwn.net/Articles/815342/ (based on v5.5)
This change makes newly faulted anon pages go to the inactive list instead. The drawback: if the inactive list is too short, most pages are evicted before they get a chance to be promoted to the active list.

The kernel handles this case by putting newly faulted, file-backed pages directly onto the inactive list; they will only move to the active list if they are accessed again before being reclaimed.

active to inactive

LRU cache

If every page were added to the active/inactive lists one at a time, contention on the LRU lock would be severe, so the kernel keeps per-CPU LRU caches (pagevecs): pages accumulate there and are only moved to the real lists, under the lock, once a batch has built up.

struct pagevec {
	unsigned long nr;
	unsigned long cold;
	struct page *pages[PAGEVEC_SIZE];
};
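
The batching lives in __lru_cache_add() (v4.14): the page is parked in the per-CPU lru_add_pvec, and only when the pagevec is full (or the page is compound) is the whole batch flushed onto the real LRU under the lock:

static void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	get_page(page);
	if (!pagevec_add(pvec, page) || PageCompound(page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvec);
}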

There are six per-CPU caches, one per situation:

  • a brand-new page is added to the LRU, e.g. on a page fault
    • lru_add_pvec, drained via __pagevec_lru_add_fn
  • inactive lru -> inactive lru tail: when writeback of a dirty page finishes and the page was already a reclaim candidate, move it straight to the tail
    • lru_rotate_pvecs, drained via pagevec_move_tail
    • does not call update_page_reclaim_stat()
  • active lru -> inactive lru
    • lru_deactivate_file_pvecs / lru_deactivate_file_fn: file pages released by an external action, e.g. drop_caches
    • lru_deactivate_pvecs / lru_deactivate_fn: deactivate a page
    • lru_lazyfree_pvecs / lru_lazyfree_fn: mark an anon page lazy-free to speed up its reclaim
  • inactive lru -> active lru
    • activate_page_pvecs, drained via __activate_page
// decide which LRU list the page belongs on, based on its flags
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, void *arg)
{
    int file = page_is_file_cache(page);
    int active = PageActive(page);
    enum lru_list lru = page_lru(page);

    VM_BUG_ON_PAGE(PageLRU(page), page);

    SetPageLRU(page);
    add_page_to_lru_list(page, lruvec, lru);
    update_page_reclaim_stat(lruvec, file, active);
    trace_mm_lru_insertion(page, lru);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
    pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
EXPORT_SYMBOL(__pagevec_lru_add);

lru_cache_add()
    __lru_cache_add()
        __pagevec_lru_add()

lru_add_drain_cpu()
    __pagevec_lru_add()

Refault

https://lwn.net/Articles/495543/

These show up as counters in /proc/vmstat:

workingset_refault 0
workingset_activate 0
workingset_restore 0
workingset_nodereclaim 0

At this point the kernel only does refault tracking for file pages.

The other change addresses the fact that refault tracking, in current kernels, is only done for the file-backed LRU list. Once an anonymous page is reclaimed, the kernel forgets about its history. As it turns out, the previous change (faulting pages into the inactive list) can exacerbate some bad behavior: adding new pages to the inactive list can quickly push out other pages that were just faulted in before they can be accessed a second time and promoted to the active list. If refault tracking were done for the anonymous LRU list, this situation could be detected and dealt with

As a general rule, refaults indicate thrashing, which is not a good thing. The kernel can respond to excessive refaulting by, for example, making the active list larger.

Workingset_refault

Some pages may be brought in and evicted over and over, causing thrashing (since file pages initially land on the inactive list, is the problem worse for file pages?).

When a file page is brought in, the kernel checks whether a shadow entry exists for it; if so, workingset_refault() may put the page straight onto the active list.

The size of the active file list is treated as the working-set size: if the refault distance is larger than that, nothing is done, because the working set could not have held the page anyway.

add_to_page_cache_lru()
    ...
    ret = __add_to_page_cache_locked(page, mapping, offset,
					 gfp_mask, &shadow);
    ...
    if (!(gfp_mask & __GFP_WRITE) && shadow)
        workingset_refault(page, shadow);
    lru_cache_add(page);

void workingset_refault(struct page *page, void *shadow) {
    ...
    refault_distance = (refault - eviction) & EVICTION_MASK;

    inc_lruvec_state(lruvec, WORKINGSET_REFAULT);

    if (refault_distance > active_file)
        goto out;

    SetPageActive(page);
    atomic_long_inc(&lruvec->inactive_age);
    inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
    ...
}
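
The shadow entry consumed here is produced on the eviction side. A trimmed sketch of workingset_eviction() (v4.14-era; called from __remove_mapping() when a file page is reclaimed, VM_BUG_ON checks dropped) shows where eviction comes from:

void *workingset_eviction(struct address_space *mapping, struct page *page)
{
	struct mem_cgroup *memcg = page_memcg(page);
	struct pglist_data *pgdat = page_pgdat(page);
	int memcgid = mem_cgroup_id(memcg);
	unsigned long eviction;
	struct lruvec *lruvec;

	lruvec = mem_cgroup_lruvec(pgdat, memcg);
	/* inactive_age ticks on every eviction and activation */
	eviction = atomic_long_inc_return(&lruvec->inactive_age);
	/* pack node, memcg id and eviction counter into the page-cache slot */
	return pack_shadow(memcgid, pgdat, eviction);
}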

Workingset_activate

A subset of refaults: incremented only when the refaulted page actually gets promoted to the active list.

others

/**
 * page_lru_base_type - which LRU list type should a page be on?
 * @page: the page to test
 *
 * Used for LRU list index arithmetic.
 *
 * Returns the base LRU type - file or anon - @page should be on.
 */
static inline enum lru_list page_lru_base_type(struct page *page)
{
	if (page_is_file_cache(page))
		return LRU_INACTIVE_FILE;
	return LRU_INACTIVE_ANON;
}

/**
 * page_lru - which LRU list should a page be on?
 * @page: the page to test
 *
 * Returns the LRU list a page should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list page_lru(struct page *page)
{
	enum lru_list lru;

	if (PageUnevictable(page))
		lru = LRU_UNEVICTABLE;
	else {
		lru = page_lru_base_type(page);
		if (PageActive(page))
			lru += LRU_ACTIVE;
	}
	return lru;
}