📄 vmscan.c
字号:
* OK, so we have swap space and a fair amount of page cache * pages. We use the recently rotated / recently scanned * ratios to determine how valuable each cache is. * * Because workloads change over time (and to avoid overflow) * we keep these statistics as a floating average, which ends * up weighing recent references more than old ones. * * anon in [0], file in [1] */ if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; spin_unlock_irq(&zone->lru_lock); } if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[1] /= 2; reclaim_stat->recent_rotated[1] /= 2; spin_unlock_irq(&zone->lru_lock); } /* * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ anon_prio = sc->swappiness; file_prio = 200 - sc->swappiness; /* * The amount of pressure on anon vs file pages is inversely * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. */ ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); ap /= reclaim_stat->recent_rotated[0] + 1; fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; /* Normalize to percentages */ percent[0] = 100 * ap / (ap + fp + 1); percent[1] = 100 - percent[0];}/* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
*/
/*
 * shrink_zone() - apply one round of reclaim pressure to a single zone.
 * @priority:	scan priority; each LRU's page count is shifted right by this
 *		value, so a smaller number means a larger scan target.
 * @zone:	zone whose evictable LRU lists are scanned.
 * @sc:		scan control; ->nr_reclaimed is read on entry and written back
 *		on exit, ->swap_cluster_max bounds each scan batch.
 *
 * The anon/file split comes from get_scan_ratio().  For global reclaim the
 * per-list targets are accumulated in zone->lru[].nr_scan and only acted on
 * once they reach swap_cluster_max; for non-global reclaim the computed
 * target is used directly.
 */
static void shrink_zone(int priority, struct zone *zone,
				struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	unsigned long percent[2];	/* anon @ 0; file @ 1 */
	enum lru_list l;
	unsigned long nr_reclaimed = sc->nr_reclaimed;
	unsigned long swap_cluster_max = sc->swap_cluster_max;

	get_scan_ratio(zone, sc, percent);

	for_each_evictable_lru(l) {
		int file = is_file_lru(l);
		/*
		 * Kept as unsigned long, like nr[] and percent[] which it is
		 * combined with, so the page count is never narrowed through
		 * an int (a negative int would be sign-extended when added to
		 * the unsigned long counters below).
		 * NOTE(review): presumably zone_nr_pages() returns an
		 * unsigned long page count - confirm against its definition.
		 */
		unsigned long scan;

		scan = zone_nr_pages(zone, sc, l);
		if (priority) {
			scan >>= priority;
			scan = (scan * percent[file]) / 100;
		}
		if (scanning_global_lru(sc)) {
			/*
			 * Batch up small targets: remember them in nr_scan and
			 * scan nothing until a full cluster has accumulated.
			 */
			zone->lru[l].nr_scan += scan;
			nr[l] = zone->lru[l].nr_scan;
			if (nr[l] >= swap_cluster_max)
				zone->lru[l].nr_scan = 0;
			else
				nr[l] = 0;
		} else
			nr[l] = scan;
	}

	/*
	 * The loop condition deliberately leaves out the active anon target;
	 * that list is still scanned by the inner loop while the other
	 * targets are outstanding.
	 */
	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
		for_each_evictable_lru(l) {
			if (nr[l]) {
				nr_to_scan = min(nr[l], swap_cluster_max);
				nr[l] -= nr_to_scan;

				nr_reclaimed += shrink_list(l, nr_to_scan,
							    zone, sc, priority);
			}
		}
		/*
		 * On large memory systems, scan >> priority can become
		 * really large. This is fine for the starting priority;
		 * we want to put equal scanning pressure on each zone.
		 * However, if the VM has a harder time of freeing pages,
		 * with multiple processes reclaiming pages, the total
		 * freeing target can get unreasonably large.
		 */
		if (nr_reclaimed > swap_cluster_max &&
			priority < DEF_PRIORITY && !current_is_kswapd())
			break;
	}

	sc->nr_reclaimed = nr_reclaimed;

	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
	if (inactive_anon_is_low(zone, sc))
		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);

	throttle_vm_writeout(sc->gfp_mask);
}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over pages_high.
Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or * b) The zones may be over pages_high but they must go *over* pages_high to * satisfy the `incremental min' zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc){ enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); struct zoneref *z; struct zone *zone; sc->all_unreclaimable = 1; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!populated_zone(zone)) continue; /* * Take care memory controller reclaiming has small influence * to global LRU. */ if (scanning_global_lru(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; note_zone_scanning_priority(zone, priority); if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ sc->all_unreclaimable = 0; } else { /* * Ignore cpuset limitation here. We just want to reduce * # of used pages by us regardless of memory shortage. */ sc->all_unreclaimable = 0; mem_cgroup_note_reclaim_priority(sc->mem_cgroup, priority); } shrink_zone(priority, zone, sc); }}/* * This is the main entry point to direct page reclaim. * * If a full scan of the inactive list fails to free enough memory then we * are "out of memory" and something needs to be killed. * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this * caller can't do much about. We kick pdflush and take explicit naps in the * hope that some of these pages can be written. But if the allocating task * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. 
* * returns: 0, if no pages reclaimed * else, the number of pages reclaimed */static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc){ int priority; unsigned long ret = 0; unsigned long total_scanned = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); delayacct_freepages_start(); if (scanning_global_lru(sc)) count_vm_event(ALLOCSTALL); /* * mem_cgroup will not do shrink_slab. */ if (scanning_global_lru(sc)) { for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; lru_pages += zone_lru_pages(zone); } } for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; if (!priority) disable_swap_token(); shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups */ if (scanning_global_lru(sc)) { shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } } total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->swap_cluster_max) { ret = sc->nr_reclaimed; goto out; } /* * Try to write back as many pages as we just scanned. This * tends to cause slow streaming writers to write data to the * disk smoothly, at the dirtying rate, which is nice. But * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ if (total_scanned > sc->swap_cluster_max + sc->swap_cluster_max / 2) { wakeup_pdflush(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); } /* top priority shrink_zones still had more to do? 
don't OOM, then */ if (!sc->all_unreclaimable && scanning_global_lru(sc)) ret = sc->nr_reclaimed;out: /* * Now that we've scanned all the zones at this priority level, note * that level within the zone so that the next thread which performs * scanning of this zone will immediately start out at this priority * level. This affects only the decision whether or not to bring * mapped pages onto the inactive list. */ if (priority < 0) priority = 0; if (scanning_global_lru(sc)) { for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; zone->prev_priority = priority; } } else mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); delayacct_freepages_end(); return ret;}unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask){ struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, .may_swap = 1, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, }; return do_try_to_free_pages(zonelist, &sc);}#ifdef CONFIG_CGROUP_MEM_RES_CTLRunsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, bool noswap, unsigned int swappiness){ struct scan_control sc = { .may_writepage = !laptop_mode, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, }; struct zonelist *zonelist; if (noswap) sc.may_swap = 0; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; return do_try_to_free_pages(zonelist, &sc);}#endif/* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at pages_high. * * Returns the number of pages which were actually freed. * * There is special handling here for zones which are full of pinned pages. 
* This can happen if the pages are all mlocked, or if they are all used by * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. * What we do is to detect the case where all pages in the zone have been * scanned twice and there has been zero successful reclaim. Mark the zone as * dead and from now on, only perform a short scan. Basically we're polling * the zone for when the problem goes away. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > pages_high, but once a zone is found to have * free_pages <= pages_high, we scan that zone and the lower zones regardless * of the number of free pages in the lower zones. This interoperates with * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. */static unsigned long balance_pgdat(pg_data_t *pgdat, int order){ int all_zones_ok; int priority; int i; unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, }; /* * temp_priority is used to remember the scanning priority at which * this zone was successfully refilled to free_pages == pages_high. */ int temp_priority[MAX_NR_ZONES];loop_again: total_scanned = 0; sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); for (i = 0; i < pgdat->nr_zones; i++) temp_priority[i] = DEF_PRIORITY; for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; /* The swap token gets in the way of swapout... 
*/ if (!priority) disable_swap_token(); all_zones_ok = 1; /* * Scan in the highmem->dma direction for the highest * zone which needs scanning */ for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) continue; if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) continue; /* * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ if (inactive_anon_is_low(zone, &sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); if (!zone_watermark_ok(zone, order, zone->pages_high, 0, 0)) { end_zone = i; break; } } if (i < 0) goto out; for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; lru_pages += zone_lru_pages(zone); } /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. * * We do this because the page allocator works in the opposite * direction. This prevents the page allocator from allocating * pages behind kswapd's direction of progress, which would * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; if (!populated_zone(zone)) continue; if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) continue; if (!zone_watermark_ok(zone, order, zone->pages_high, end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; note_zone_scanning_priority(zone, priority); /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ if (!zone_watermark_ok(zone, order, 8*zone->pages_high, end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone_is_all_unreclaimable(zone))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -