📄 vmscan.c
字号:
continue; if (nr_slab == 0 && zone->pages_scanned >= (zone_lru_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* * If we've done a decent amount of scanning and * the reclaim ratio is low, start doing writepage * even in laptop mode */ if (total_scanned > SWAP_CLUSTER_MAX * 2 && total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; } if (all_zones_ok) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ if (total_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); /* * We do this so kswapd doesn't build up large priorities for * example when it is freeing in parallel with allocators. It * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) break; }out: /* * Note within each zone the priority level at which this zone was * brought into a happy state. So that the next thread which scans this * zone will start out at that priority level. */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; zone->prev_priority = temp_priority[i]; } if (!all_zones_ok) { cond_resched(); try_to_freeze(); /* * Fragmentation may mean that the system cannot be * rebalanced for high-order allocations in all zones. * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, * it means the zones have been fully scanned and are still * not balanced. For high-order allocations, there is * little point trying all over again as kswapd may * infinite loop. * * Instead, recheck all watermarks at order-0 as they * are the most important. If watermarks are ok, kswapd will go * back to sleep. High-order users can still perform direct * reclaim if they wish. */ if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) order = sc.order = 0; goto loop_again; } return sc.nr_reclaimed;}/* * The background pageout daemon, started as a kernel thread * from the init process. 
* * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity * that frees anything up. This is needed for things like routing * etc, where we otherwise might have all activity going on in * asynchronous contexts that cannot page things out. * * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */static int kswapd(void *p){ unsigned long order; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; DEFINE_WAIT(wait); struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; node_to_cpumask_ptr(cpumask, pgdat->node_id); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; /* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it * regardless (see "__alloc_pages()"). "kswapd" should * never get caught in the normal page freeing logic. * * (Kswapd normally doesn't need memory anyway, but sometimes * you need a small amount of memory in order to be able to * page out something else, and this flag essentially protects * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). 
*/ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); order = 0; for ( ; ; ) { unsigned long new_order; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; pgdat->kswapd_max_order = 0; if (order < new_order) { /* * Don't sleep if someone wants a larger 'order' * allocation */ order = new_order; } else { if (!freezing(current)) schedule(); order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); if (!try_to_freeze()) { /* We can speed up thawing tasks if we don't call * balance_pgdat after returning from the refrigerator */ balance_pgdat(pgdat, order); } } return 0;}/* * A zone is low on free memory, so wake its kswapd task to service it. */void wakeup_kswapd(struct zone *zone, int order){ pg_data_t *pgdat; if (!populated_zone(zone)) return; pgdat = zone->zone_pgdat; if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; if (!waitqueue_active(&pgdat->kswapd_wait)) return; wake_up_interruptible(&pgdat->kswapd_wait);}unsigned long global_lru_pages(void){ return global_page_state(NR_ACTIVE_ANON) + global_page_state(NR_ACTIVE_FILE) + global_page_state(NR_INACTIVE_ANON) + global_page_state(NR_INACTIVE_FILE);}#ifdef CONFIG_PM/* * Helper function for shrink_all_memory(). 
Tries to reclaim 'nr_pages' pages
 * from LRU lists system-wide, for given pass and priority, and returns the
 * number of reclaimed pages
 *
 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
 */
static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
				      int pass, struct scan_control *sc)
{
	unsigned long reclaimed = 0;
	struct zone *zone;

	for_each_zone(zone) {
		enum lru_list lru;

		if (!populated_zone(zone))
			continue;

		/* Unreclaimable zones are only visited at DEF_PRIORITY. */
		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
			continue;

		for_each_evictable_lru(lru) {
			unsigned long lru_pages =
				zone_page_state(zone, NR_LRU_BASE + lru);

			/* For pass = 0, we don't shrink the active list */
			if (pass == 0) {
				if (lru == LRU_ACTIVE_ANON ||
				    lru == LRU_ACTIVE_FILE)
					continue;
			}

			/* Accumulate scan pressure for this list. */
			zone->lru[lru].nr_scan += (lru_pages >> prio) + 1;

			/*
			 * Hold off until enough pressure has built up, unless
			 * this is a late pass (pass > 3), which always scans.
			 */
			if (zone->lru[lru].nr_scan < nr_pages && pass <= 3)
				continue;

			zone->lru[lru].nr_scan = 0;
			reclaimed += shrink_list(lru, min(nr_pages, lru_pages),
						 zone, sc, prio);
			if (reclaimed >= nr_pages)
				return reclaimed;
		}
	}

	return reclaimed;
}

/*
 * Try to free `nr_pages' of memory, system-wide, and return the number of
 * freed pages.
* * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */unsigned long shrink_all_memory(unsigned long nr_pages){ unsigned long lru_pages, nr_slab; unsigned long ret = 0; int pass; struct reclaim_state reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_swap = 0, .swap_cluster_max = nr_pages, .may_writepage = 1, .isolate_pages = isolate_pages_global, }; current->reclaim_state = &reclaim_state; lru_pages = global_lru_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { reclaim_state.reclaimed_slab = 0; shrink_slab(nr_pages, sc.gfp_mask, lru_pages); if (!reclaim_state.reclaimed_slab) break; ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; nr_slab -= reclaim_state.reclaimed_slab; } /* * We try to shrink LRUs in 5 passes: * 0 = Reclaim from inactive_list only * 1 = Reclaim from active list but don't reclaim mapped * 2 = 2nd pass of type 1 * 3 = Reclaim mapped (normal reclaim) * 4 = 2nd pass of type 3 */ for (pass = 0; pass < 5; pass++) { int prio; /* Force reclaiming mapped pages in the passes #3 and #4 */ if (pass > 2) sc.may_swap = 1; for (prio = DEF_PRIORITY; prio >= 0; prio--) { unsigned long nr_to_scan = nr_pages - ret; sc.nr_scanned = 0; ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); if (ret >= nr_pages) goto out; reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, global_lru_pages()); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ / 10); } } /* * If ret = 0, we could not shrink LRUs, but there may be something * in slab caches */ if (!ret) { do { reclaim_state.reclaimed_slab = 0; shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && 
reclaim_state.reclaimed_slab > 0); }out: current->reclaim_state = NULL; return ret;}#endif/* It's optimal to keep kswapds on the same CPUs as their memory, but not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu){ int nid; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { for_each_node_state(nid, N_HIGH_MEMORY) { pg_data_t *pgdat = NODE_DATA(nid); node_to_cpumask_ptr(mask, pgdat->node_id); if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ set_cpus_allowed_ptr(pgdat->kswapd, mask); } } return NOTIFY_OK;}/* * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */int kswapd_run(int nid){ pg_data_t *pgdat = NODE_DATA(nid); int ret = 0; if (pgdat->kswapd) return 0; pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state == SYSTEM_BOOTING); printk("Failed to start kswapd on node %d\n",nid); ret = -1; } return ret;}static int __init kswapd_init(void){ int nid; swap_setup(); for_each_node_state(nid, N_HIGH_MEMORY) kswapd_run(nid); hotcpu_notifier(cpu_callback, 0); return 0;}module_init(kswapd_init)#ifdef CONFIG_NUMA/* * Zone reclaim mode * * If non-zero call zone_reclaim when the number of free pages falls below * the watermarks. */int zone_reclaim_mode __read_mostly;#define RECLAIM_OFF 0#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim *//* * Priority for ZONE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. 
*/#define ZONE_RECLAIM_PRIORITY 4/* * Percentage of pages in a zone that must be unmapped for zone_reclaim to * occur. */int sysctl_min_unmapped_ratio = 1;/* * If the number of slab pages in a zone grows beyond this percentage then * slab reclaim needs to occur. */int sysctl_min_slab_ratio = 5;/* * Try to free up some pages from this zone through reclaim. */static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order){ /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; int priority; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .isolate_pages = isolate_pages_global, }; unsigned long slab_reclaimable; disable_swap_token(); cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP * and we also need to be able to write out pages for RECLAIM_WRITE * and RECLAIM_SWAP. */ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; if (zone_page_state(zone, NR_FILE_PAGES) - zone_page_state(zone, NR_FILE_MAPPED) > zone->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. */ priority = ZONE_RECLAIM_PRIORITY; do { note_zone_scanning_priority(zone, priority); shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && sc.nr_reclaimed < nr_pages); } slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); if (slab_reclaimable > zone->min_slab_pages) { /* * shrink_slab() does not currently allow us to determine how * many pages were freed in this zone. 
So we take the current * number of slab pages and shake the slab until it is reduced * by the same nr_pages that we used for reclaiming unmapped * pages. * * Note that shrink_slab will free memory on all zones and may * take a
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -