📄 page-writeback.c
/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002	Andrew Morton
 *		Initial version
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>

/*
 * The maximum number of pages to writeout in a single bdflush/kupdate
 * operation.  We do this so we don't hold I_SYNC against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode.  Also, the code reevaluates
 * the dirty each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES	1024

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

/*
 * When balance_dirty_pages decides that the caller needs to perform some
 * non-background writeback, this is how many pages it will attempt to write.
 * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
 * large amounts of I/O are submitted.
 */
static inline long sync_writeback_pages(void)
{
	return ratelimit_pages + ratelimit_pages / 2;
}

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via pdflush) at this percentage
 */
int dirty_background_ratio = 5;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 10;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks, in jiffies
 */
int dirty_writeback_interval = 5 * HZ;

/*
 * The longest number of jiffies for which data is allowed to remain dirty
 */
int dirty_expire_interval = 30 * HZ;

/*
 * Flag that makes the machine dump writes/reads and block dirtyings.
 */
int block_dump;

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

static void background_writeout(unsigned long _min_pages);
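/*
 * [Editor's note: illustrative example, not part of the original source.
 *  The numbers assume 4 KiB pages and roughly 4 GiB (1,048,576 pages) of
 *  dirtyable memory.]
 *
 * With the defaults above (dirty_background_ratio = 5, vm_dirty_ratio = 10),
 * background writeback via pdflush would start at about 52,428 dirty pages
 * (~205 MiB) and direct throttling of writers at about 104,857 pages
 * (~410 MiB).  Writing a non-zero value to dirty_background_bytes or
 * vm_dirty_bytes overrides the corresponding ratio; the sysctl handlers
 * further below enforce this by zeroing the other member of each pair.
 */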
/*
 * Scale the writeback cache size proportional to the relative writeout speeds.
 *
 * We do this by keeping a floating proportion between BDIs, based on page
 * writeback completions [end_page_writeback()]. Those devices that write out
 * pages fastest will get the larger share, while the slower will get a smaller
 * share.
 *
 * We use page writeout completions because we are interested in getting rid of
 * dirty pages. Having them written out is the primary goal.
 *
 * We introduce a concept of time, a period over which we measure these events,
 * because demand can/will vary over time. The length of this period itself is
 * measured in page writeback completions.
 *
 */
static struct prop_descriptor vm_completions;
static struct prop_descriptor vm_dirties;

/*
 * couple the period to the dirty_ratio:
 *
 *   period/2 ~ roundup_pow_of_two(dirty limit)
 */
static int calc_period_shift(void)
{
	unsigned long dirty_total;

	if (vm_dirty_bytes)
		dirty_total = vm_dirty_bytes / PAGE_SIZE;
	else
		dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
				100;
	return 2 + ilog2(dirty_total - 1);
}

/*
 * update the period when the dirty threshold changes.
 */
static void update_completion_period(void)
{
	int shift = calc_period_shift();
	prop_change_shift(&vm_completions, shift);
	prop_change_shift(&vm_dirties, shift);
}
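/*
 * [Editor's note: illustrative example, not part of the original source.]
 *
 * With vm_dirty_bytes == 0, vm_dirty_ratio == 10 and
 * determine_dirtyable_memory() returning 1,048,576 pages, dirty_total is
 * 104,857 pages and ilog2(104,856) == 16, so calc_period_shift() returns 18.
 * The floating-proportion period is then on the order of 2^18 = 262,144
 * writeback completions, i.e. period/2 = 131,072 =
 * roundup_pow_of_two(104,857), matching the relation stated above.
 */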
int dirty_background_ratio_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_bytes = 0;
	return ret;
}

int dirty_background_bytes_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
	if (ret == 0 && write)
		dirty_background_ratio = 0;
	return ret;
}

int dirty_ratio_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int old_ratio = vm_dirty_ratio;
	int ret;

	ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
		update_completion_period();
		vm_dirty_bytes = 0;
	}
	return ret;
}

int dirty_bytes_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	unsigned long old_bytes = vm_dirty_bytes;
	int ret;

	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
		update_completion_period();
		vm_dirty_ratio = 0;
	}
	return ret;
}

/*
 * Increment the BDI's writeout completion count and the global writeout
 * completion count. Called from test_clear_page_writeback().
 */
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
	__prop_inc_percpu_max(&vm_completions, &bdi->completions,
			      bdi->max_prop_frac);
}

void bdi_writeout_inc(struct backing_dev_info *bdi)
{
	unsigned long flags;

	local_irq_save(flags);
	__bdi_writeout_inc(bdi);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(bdi_writeout_inc);

void task_dirty_inc(struct task_struct *tsk)
{
	prop_inc_single(&vm_dirties, &tsk->dirties);
}

/*
 * Obtain an accurate fraction of the BDI's portion.
 */
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
		long *numerator, long *denominator)
{
	if (bdi_cap_writeback_dirty(bdi)) {
		prop_fraction_percpu(&vm_completions, &bdi->completions,
				numerator, denominator);
	} else {
		*numerator = 0;
		*denominator = 1;
	}
}

/*
 * Clip the earned share of dirty pages to that which is actually available.
 * This avoids exceeding the total dirty_limit when the floating averages
 * fluctuate too quickly.
 */
static void
clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
{
	long avail_dirty;

	avail_dirty = dirty -
		(global_page_state(NR_FILE_DIRTY) +
		 global_page_state(NR_WRITEBACK) +
		 global_page_state(NR_UNSTABLE_NFS) +
		 global_page_state(NR_WRITEBACK_TEMP));

	if (avail_dirty < 0)
		avail_dirty = 0;

	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
		bdi_stat(bdi, BDI_WRITEBACK);

	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
}

static inline void task_dirties_fraction(struct task_struct *tsk,
		long *numerator, long *denominator)
{
	prop_fraction_single(&vm_dirties, &tsk->dirties,
				numerator, denominator);
}

/*
 * scale the dirty limit
 *
 * task specific dirty limit:
 *
 *   dirty -= (dirty/8) * p_{t}
 */
static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
{
	long numerator, denominator;
	long dirty = *pdirty;
	u64 inv = dirty >> 3;

	task_dirties_fraction(tsk, &numerator, &denominator);
	inv *= numerator;
	do_div(inv, denominator);

	dirty -= inv;
	if (dirty < *pdirty/2)
		dirty = *pdirty/2;

	*pdirty = dirty;
}
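/*
 * [Editor's note: illustrative example, not part of the original source.]
 *
 * If *pdirty is 100,000 pages and task_dirties_fraction() reports that this
 * task produced 1/4 of the recently dirtied pages, then
 * inv = (100,000 >> 3) * 1/4 = 3,125 and the task-specific limit becomes
 * 96,875 pages.  A task responsible for all recent dirtying loses the full
 * dirty/8 (12,500 pages), and the limit is never pushed below *pdirty/2.
 */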
/*
 *
 */
static DEFINE_SPINLOCK(bdi_lock);
static unsigned int bdi_min_ratio;

int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&bdi_lock, flags);
	if (min_ratio > bdi->max_ratio) {
		ret = -EINVAL;
	} else {
		min_ratio -= bdi->min_ratio;
		if (bdi_min_ratio + min_ratio < 100) {
			bdi_min_ratio += min_ratio;
			bdi->min_ratio += min_ratio;
		} else {
			ret = -EINVAL;
		}
	}
	spin_unlock_irqrestore(&bdi_lock, flags);

	return ret;
}

int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
	unsigned long flags;
	int ret = 0;

	if (max_ratio > 100)
		return -EINVAL;

	spin_lock_irqsave(&bdi_lock, flags);
	if (bdi->min_ratio > max_ratio) {
		ret = -EINVAL;
	} else {
		bdi->max_ratio = max_ratio;
		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
	}
	spin_unlock_irqrestore(&bdi_lock, flags);

	return ret;
}
EXPORT_SYMBOL(bdi_set_max_ratio);

/*
 * Work out the current dirty-memory clamping and background writeout
 * thresholds.
 *
 * The main aim here is to lower them aggressively if there is a lot of mapped
 * memory around.  To avoid stressing page reclaim with lots of unreclaimable
 * pages.  It is better to clamp down on writers than to start swapping, and
 * performing lots of scanning.
 *
 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
 *
 * We don't permit the clamping level to fall below 5% - that is getting rather
 * excessive.
 *
 * We make sure that the background writeout level is below the adjusted
 * clamping level.
 */
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
	int node;
	unsigned long x = 0;

	for_each_node_state(node, N_HIGH_MEMORY) {
		struct zone *z =
			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];

		x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
	}
	/*
	 * Make sure that the number of highmem pages is never larger
	 * than the number of the total dirtyable memory. This can only
	 * occur in very strange VM situations but we want to make sure
	 * that this does not occur.
	 */
	return min(x, total);
#else
	return 0;
#endif
}

/**
 * determine_dirtyable_memory - amount of memory that may be used
 *
 * Returns the number of pages that can currently be freed and used
 * by the kernel for direct mappings.
 */
unsigned long determine_dirtyable_memory(void)
{
	unsigned long x;

	x = global_page_state(NR_FREE_PAGES) + global_lru_pages();

	if (!vm_highmem_is_dirtyable)
		x -= highmem_dirtyable_memory(x);

	return x + 1;	/* Ensure that we never return 0 */
}

void
get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
		 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
{
	unsigned long background;
	unsigned long dirty;
	unsigned long available_memory = determine_dirtyable_memory();
	struct task_struct *tsk;

	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
	else {
		int dirty_ratio;

		dirty_ratio = vm_dirty_ratio;
		if (dirty_ratio < 5)
			dirty_ratio = 5;
		dirty = (dirty_ratio * available_memory) / 100;
	}

	if (dirty_background_bytes)
		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
	else
		background = (dirty_background_ratio * available_memory) / 100;

	if (background >= dirty)
		background = dirty / 2;
	tsk = current;
	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
		background += background / 4;
		dirty += dirty / 4;
	}
	*pbackground = background;
	*pdirty = dirty;

	if (bdi) {
		u64 bdi_dirty;
		long numerator, denominator;

		/*
		 * Calculate this BDI's share of the dirty ratio.
		 */
		bdi_writeout_fraction(bdi, &numerator, &denominator);

		bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
		bdi_dirty *= numerator;
		do_div(bdi_dirty, denominator);
		bdi_dirty += (dirty * bdi->min_ratio) / 100;
		if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
			bdi_dirty = dirty * bdi->max_ratio / 100;

		*pbdi_dirty = bdi_dirty;
		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
		task_dirty_limit(current, pbdi_dirty);
	}
}
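/*
 * [Editor's note: illustrative example, not part of the original source.]
 *
 * With dirty = 100,000 pages, bdi_min_ratio == 0, bdi->min_ratio == 0,
 * bdi->max_ratio == 100 and a device whose completion fraction is 3/10,
 * bdi_dirty = (100,000 * 100/100) * 3/10 = 30,000 pages, well under the
 * max_ratio cap of 100,000.  clip_bdi_dirty_limit() then clips that against
 * the dirty headroom actually left globally, and task_dirty_limit() may
 * lower it further for tasks that dirty pages heavily.
 */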