📄 filemap.c
字号:
/* * linux/mm/filemap.c * * Copyright (C) 1994-1999 Linus Torvalds *//* * This file handles the generic file mmap semantics used by * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */#include <linux/module.h>#include <linux/slab.h>#include <linux/compiler.h>#include <linux/fs.h>#include <linux/uaccess.h>#include <linux/aio.h>#include <linux/capability.h>#include <linux/kernel_stat.h>#include <linux/mm.h>#include <linux/swap.h>#include <linux/mman.h>#include <linux/pagemap.h>#include <linux/file.h>#include <linux/uio.h>#include <linux/hash.h>#include <linux/writeback.h>#include <linux/backing-dev.h>#include <linux/pagevec.h>#include <linux/blkdev.h>#include <linux/security.h>#include <linux/syscalls.h>#include <linux/cpuset.h>#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */#include <linux/memcontrol.h>#include <linux/mm_inline.h> /* for page_is_file_cache() */#include "internal.h"/* * FIXME: remove all knowledge of the buffer layer from the core VM */#include <linux/buffer_head.h> /* for generic_osync_inode */#include <asm/mman.h>/* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. * * Shared mappings now work. 15.8.1995 Bruno. * * finished 'unifying' the page and buffer cache and SMP-threaded the * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> * * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> *//* * Lock ordering: * * ->i_mmap_lock (vmtruncate) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) * * ->mmap_sem * ->i_mmap_lock * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem * ->lock_page (access_process_vm) * * ->i_mutex (generic_file_buffered_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * * ->i_mutex * ->i_alloc_sem (various) * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * * ->i_mmap_lock * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) * * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * * ->task->proc_lock * ->dcache_lock (proc_pid_lookup) *//* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */void __remove_from_page_cache(struct page *page){ struct address_space *mapping = page->mapping; radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * * Fix it up by doing a final dirty accounting check after * having removed the page entirely. */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); }}void remove_from_page_cache(struct page *page){ struct address_space *mapping = page->mapping; BUG_ON(!PageLocked(page)); spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock);}static int sync_page(void *word){ struct address_space *mapping; struct page *page; page = container_of((unsigned long *)word, struct page, flags); /* * page_mapping() is being called without PG_locked held. * Some knowledge of the state and use of the page is used to * reduce the requirements down to a memory barrier. * The danger here is of a stale page_mapping() return value * indicating a struct address_space different from the one it's * associated with when it is associated with one. * After smp_mb(), it's either the correct page_mapping() for * the page, or an old page_mapping() and the page's own * page_mapping() has gone NULL. * The ->sync_page() address_space operation must tolerate * page_mapping() going NULL. By an amazing coincidence, * this comes about because none of the users of the page * in the ->sync_page() methods make essential use of the * page_mapping(), merely passing the page down to the backing * device's unplug functions when it's non-NULL, which in turn * ignore it for all cases but swap, where only page_private(page) is * of interest. When page_mapping() does go NULL, the entire * call stack gracefully ignores the page and returns. * -- wli */ smp_mb(); mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) mapping->a_ops->sync_page(page); io_schedule(); return 0;}static int sync_page_killable(void *word){ sync_page(word); return fatal_signal_pending(current) ? -EINTR : 0;}/** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. */int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode){ int ret; struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; if (!mapping_cap_writeback_dirty(mapping)) return 0; ret = do_writepages(mapping, &wbc); return ret;}static inline int __filemap_fdatawrite(struct address_space *mapping, int sync_mode){ return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);}int filemap_fdatawrite(struct address_space *mapping){ return __filemap_fdatawrite(mapping, WB_SYNC_ALL);}EXPORT_SYMBOL(filemap_fdatawrite);int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end){ return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);}EXPORT_SYMBOL(filemap_fdatawrite_range);/** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. */int filemap_flush(struct address_space *mapping){ return __filemap_fdatawrite(mapping, WB_SYNC_NONE);}EXPORT_SYMBOL(filemap_flush);/** * wait_on_page_writeback_range - wait for writeback to complete * @mapping: target address_space * @start: beginning page index * @end: ending page index * * Wait for writeback to complete against pages indexed by start->end * inclusive */int wait_on_page_writeback_range(struct address_space *mapping, pgoff_t start, pgoff_t end){ struct pagevec pvec; int nr_pages; int ret = 0; pgoff_t index; if (end < start) return 0; pagevec_init(&pvec, 0); index = start; while ((index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_WRITEBACK, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { unsigned i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* until radix tree lookup accepts end_index */ if (page->index > end) continue; wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; } pagevec_release(&pvec); cond_resched(); } /* Check for outstanding write errors */ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; if (test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret;}/** * sync_page_range - write and wait on all pages in the passed range * @inode: target inode * @mapping: target address_space * @pos: beginning offset in pages to write * @count: number of bytes to write * * Write and wait upon all the pages in the passed range. This is a "data * integrity" operation. It waits upon in-flight writeout before starting and * waiting upon new writeout. If there was an IO error, return it. * * We need to re-take i_mutex during the generic_osync_inode list walk because * it is otherwise livelockable. */int sync_page_range(struct inode *inode, struct address_space *mapping, loff_t pos, loff_t count){ pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; int ret; if (!mapping_cap_writeback_dirty(mapping) || !count) return 0; ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); if (ret == 0) { mutex_lock(&inode->i_mutex); ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); mutex_unlock(&inode->i_mutex); } if (ret == 0) ret = wait_on_page_writeback_range(mapping, start, end); return ret;}EXPORT_SYMBOL(sync_page_range);/** * sync_page_range_nolock - write & wait on all pages in the passed range without locking * @inode: target inode * @mapping: target address_space * @pos: beginning offset in pages to write * @count: number of bytes to write * * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. */int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, loff_t pos, loff_t count){ pgoff_t start = pos >> PAGE_CACHE_SHIFT; pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; int ret; if (!mapping_cap_writeback_dirty(mapping) || !count) return 0; ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); if (ret == 0) ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); if (ret == 0) ret = wait_on_page_writeback_range(mapping, start, end); return ret;}EXPORT_SYMBOL(sync_page_range_nolock);/** * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space * and wait for all of them. */int filemap_fdatawait(struct address_space *mapping){ loff_t i_size = i_size_read(mapping->host); if (i_size == 0) return 0; return wait_on_page_writeback_range(mapping, 0, (i_size - 1) >> PAGE_CACHE_SHIFT);}EXPORT_SYMBOL(filemap_fdatawait);int filemap_write_and_wait(struct address_space *mapping){ int err = 0; if (mapping->nrpages) { err = filemap_fdatawrite(mapping); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) { int err2 = filemap_fdatawait(mapping); if (!err) err = err2; } } return err;}EXPORT_SYMBOL(filemap_write_and_wait);/** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that `lend' is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). */int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend){ int err = 0; if (mapping->nrpages) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ if (err != -EIO) { int err2 = wait_on_page_writeback_range(mapping, lstart >> PAGE_CACHE_SHIFT, lend >> PAGE_CACHE_SHIFT); if (!err) err = err2; } } return err;}/** * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space * @offset: page index * @gfp_mask: page allocation mode * * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. */int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask){ int error; VM_BUG_ON(!PageLocked(page)); error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); if (error) goto out; error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { page_cache_get(page); page->mapping = mapping; page->index = offset; spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); } else { page->mapping = NULL; mem_cgroup_uncharge_cache_page(page); page_cache_release(page); } spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else mem_cgroup_uncharge_cache_page(page);out: return error;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -