xfs_buf.c
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include "xfs_linux.h"

STATIC kmem_zone_t *xfs_buf_zone;
STATIC kmem_shaker_t xfs_buf_shake;
STATIC int xfsbufd(void *);
STATIC int xfsbufd_wakeup(int, gfp_t);
STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);

STATIC struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;

#ifdef XFS_BUF_TRACE
void
xfs_buf_trace(
	xfs_buf_t	*bp,
	char		*id,
	void		*data,
	void		*ra)
{
	ktrace_enter(xfs_buf_trace_buf,
		bp, id,
		(void *)(unsigned long)bp->b_flags,
		(void *)(unsigned long)bp->b_hold.counter,
		(void *)(unsigned long)bp->b_sema.count.counter,
		(void *)current,
		data, ra,
		(void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
		(void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
		(void *)(unsigned long)bp->b_buffer_length,
		NULL, NULL, NULL, NULL, NULL);
}
ktrace_t *xfs_buf_trace_buf;
#define XFS_BUF_TRACE_SIZE	4096
#define XB_TRACE(bp, id, data)	\
	xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
#else
#define XB_TRACE(bp, id, data)	do { } while (0)
#endif

#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
	  ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)

#define xb_to_km(flags) \
	 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)

#define xfs_buf_allocate(flags) \
	kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
#define xfs_buf_deallocate(bp) \
	kmem_zone_free(xfs_buf_zone, (bp));

/*
 *	Page Region interfaces.
 *
 *	For pages in filesystems where the blocksize is smaller than the
 *	pagesize, we use the page->private field (long) to hold a bitmap
 *	of uptodate regions within the page.
 *
 *	Each such region is "bytes per page / bits per long" bytes long.
 *
 *	NBPPR == number-of-bytes-per-page-region
 *	BTOPR == bytes-to-page-region (rounded up)
 *	BTOPRT == bytes-to-page-region-truncated (rounded down)
 */
#if (BITS_PER_LONG == 32)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 5)	/* (32 == 1<<5) */
#elif (BITS_PER_LONG == 64)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 6)	/* (64 == 1<<6) */
#else
#error BITS_PER_LONG must be 32 or 64
#endif
#define NBPPR		(PAGE_CACHE_SIZE/BITS_PER_LONG)
#define BTOPR(b)	(((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
#define BTOPRT(b)	(((unsigned int)(b) >> PRSHIFT))

STATIC unsigned long
page_region_mask(
	size_t		offset,
	size_t		length)
{
	unsigned long	mask;
	int		first, final;

	first = BTOPR(offset);
	final = BTOPRT(offset + length - 1);
	first = min(first, final);

	mask = ~0UL;
	mask <<= BITS_PER_LONG - (final - first + 1);
	mask >>= BITS_PER_LONG - (final + 1);

	ASSERT(offset + length <= PAGE_CACHE_SIZE);
	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);

	return mask;
}

STATIC inline void
set_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	set_page_private(page,
		page_private(page) | page_region_mask(offset, length));
	if (page_private(page) == ~0UL)
		SetPageUptodate(page);
}

STATIC inline int
test_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	unsigned long	mask = page_region_mask(offset, length);

	return (mask && (page_private(page) & mask) == mask);
}
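/*
 * Worked example (added commentary, not part of the original source),
 * assuming PAGE_CACHE_SIZE == 4096 and BITS_PER_LONG == 64: the page is
 * divided into 64 regions of NBPPR == 64 bytes each, and PRSHIFT == 6.
 * Marking a 512-byte block at offset 1024 as uptodate gives
 *
 *	first = BTOPR(1024)            = (1024 + 63) >> 6 = 16
 *	final = BTOPRT(1024 + 512 - 1) = 1535 >> 6        = 23
 *	mask  = bits 16..23            = 0x0000000000ff0000UL
 *
 * i.e. set_page_region() sets the bits for regions 16-23 (bytes
 * 1024-1535) in page->private.  Once every region of the page has been
 * marked, page_private(page) == ~0UL and the whole page is flagged
 * uptodate.
 */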
/*
 *	Mapping of multi-page buffers into contiguous virtual space
 */

typedef struct a_list {
	void		*vm_addr;
	struct a_list	*next;
} a_list_t;

STATIC a_list_t		*as_free_head;
STATIC int		as_list_len;
STATIC DEFINE_SPINLOCK(as_lock);

/*
 *	Try to batch vunmaps because they are costly.
 */
STATIC void
free_address(
	void		*addr)
{
	a_list_t	*aentry;

	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
	if (likely(aentry)) {
		spin_lock(&as_lock);
		aentry->next = as_free_head;
		aentry->vm_addr = addr;
		as_free_head = aentry;
		as_list_len++;
		spin_unlock(&as_lock);
	} else {
		vunmap(addr);
	}
}

STATIC void
purge_addresses(void)
{
	a_list_t	*aentry, *old;

	if (as_free_head == NULL)
		return;

	spin_lock(&as_lock);
	aentry = as_free_head;
	as_free_head = NULL;
	as_list_len = 0;
	spin_unlock(&as_lock);

	while ((old = aentry) != NULL) {
		vunmap(aentry->vm_addr);
		aentry = aentry->next;
		kfree(old);
	}
}
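/*
 * Added note (not part of the original source): free_address() defers the
 * costly vunmap() by pushing the mapping onto as_free_head under as_lock,
 * falling back to an immediate vunmap() only if the small GFP_ATOMIC &
 * ~__GFP_HIGH allocation fails.  The list is then drained in one batch by
 * purge_addresses(); within this file that happens from
 * _xfs_buf_map_pages() once as_list_len exceeds 64, so the teardown cost
 * is paid in bulk rather than once per buffer.
 */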
/*
 *	Internal xfs_buf_t object manipulation
 */

STATIC void
_xfs_buf_initialize(
	xfs_buf_t		*bp,
	xfs_buftarg_t		*target,
	xfs_off_t		range_base,
	size_t			range_length,
	xfs_buf_flags_t		flags)
{
	/*
	 * We don't want certain flags to appear in b_flags.
	 */
	flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);

	memset(bp, 0, sizeof(xfs_buf_t));
	atomic_set(&bp->b_hold, 1);
	init_MUTEX_LOCKED(&bp->b_iodonesema);
	INIT_LIST_HEAD(&bp->b_list);
	INIT_LIST_HEAD(&bp->b_hash_list);
	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
	XB_SET_OWNER(bp);
	bp->b_target = target;
	bp->b_file_offset = range_base;
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	bp->b_buffer_length = bp->b_count_desired = range_length;
	bp->b_flags = flags;
	bp->b_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
	XB_TRACE(bp, "initialize", target);
}

/*
 *	Allocate a page array capable of holding a specified number
 *	of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count,
	xfs_buf_flags_t		flags)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_offset = xfs_buf_poff(bp->b_file_offset);
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
					page_count, xb_to_km(flags));
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 *	Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages,
			  bp->b_page_count * sizeof(struct page *));
	}
}

/*
 *	Releases the specified buffer.
 *
 *	The modification state of any associated pages is left unchanged.
 *	The buffer must not be on any hash - use xfs_buf_rele instead for
 *	hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	XB_TRACE(bp, "free", 0);

	ASSERT(list_empty(&bp->b_hash_list));

	if (bp->b_flags & _XBF_PAGE_CACHE) {
		uint		i;

		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
			free_address(bp->b_addr - bp->b_offset);

		for (i = 0; i < bp->b_page_count; i++)
			page_cache_release(bp->b_pages[i]);
		_xfs_buf_free_pages(bp);
	} else if (bp->b_flags & _XBF_KMEM_ALLOC) {
		/*
		 * XXX(hch): bp->b_count_desired might be incorrect (see
		 * xfs_buf_associate_memory for details), but fortunately
		 * the Linux version of kmem_free ignores the len argument..
		 */
		kmem_free(bp->b_addr, bp->b_count_desired);
		_xfs_buf_free_pages(bp);
	}

	xfs_buf_deallocate(bp);
}
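/*
 * Worked example (added commentary, not part of the original source),
 * assuming 4 KiB pages: a buffer that starts 512 bytes into a page and
 * covers 16 KiB spans five pages, which is the page_count that
 * _xfs_buf_lookup_pages() below derives from
 * xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset).  When that count
 * fits within XB_PAGES (a small constant from the companion header, not
 * shown here), _xfs_buf_get_pages() uses the inline bp->b_page_array and
 * avoids a separate allocation; only larger buffers kmem_alloc() a page
 * pointer array, which is exactly the case _xfs_buf_free_pages() checks
 * for before freeing.
 */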
/*
 *	Finds all pages for buffer in question and builds its page list.
 */
STATIC int
_xfs_buf_lookup_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	struct address_space	*mapping = bp->b_target->bt_mapping;
	size_t			blocksize = bp->b_target->bt_bsize;
	size_t			size = bp->b_count_desired;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	pgoff_t			first;
	xfs_off_t		end;
	int			error;

	end = bp->b_file_offset + bp->b_buffer_length;
	page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);

	error = _xfs_buf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;
	bp->b_flags |= _XBF_PAGE_CACHE;

	offset = bp->b_offset;
	first = bp->b_file_offset >> PAGE_CACHE_SHIFT;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;

	      retry:
		page = find_or_create_page(mapping, first + i, gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				for (i = 0; i < bp->b_page_count; i++)
					unlock_page(bp->b_pages[i]);
				return -ENOMEM;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				printk(KERN_ERR
					"XFS: possible memory allocation "
					"deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, gfp_mask);

			XFS_STATS_INC(xb_page_retries);
			xfsbufd_wakeup(0, gfp_mask);
			blk_congestion_wait(WRITE, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(xb_page_found);

		nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
		size -= nbytes;

		if (!PageUptodate(page)) {
			page_count--;
			if (blocksize >= PAGE_CACHE_SIZE) {
				if (flags & XBF_READ)
					bp->b_locked = 1;
			} else if (!PagePrivate(page)) {
				if (test_page_region(page, offset, nbytes))
					page_count++;
			}
		}

		bp->b_pages[i] = page;
		offset = 0;
	}

	if (!bp->b_locked) {
		for (i = 0; i < bp->b_page_count; i++)
			unlock_page(bp->b_pages[i]);
	}

	if (page_count == bp->b_page_count)
		bp->b_flags |= XBF_DONE;

	XB_TRACE(bp, "lookup_pages", (long)page_count);
	return error;
}

/*
 *	Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	/* A single page buffer is always mappable */
	if (bp->b_page_count == 1) {
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	} else if (flags & XBF_MAPPED) {
		if (as_list_len > 64)
			purge_addresses();
		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
					VM_MAP, PAGE_KERNEL);
		if (unlikely(bp->b_addr == NULL))
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	}

	return 0;
}

/*
 *	Finding and Reading Buffers
 */

/*
 *	Looks up, and creates if absent, a lockable buffer for
 *	a given range of an inode.  The buffer is returned
 *	locked.  If other overlapping buffers exist, they are
 *	released before the new buffer is created and locked,
 *	which may imply that this call will block until those buffers
 *	are unlocked.  No I/O is implied by this call.
 */
xfs_buf_t *
_xfs_buf_find(
	xfs_buftarg_t		*btp,	/* block device target		*/
	xfs_off_t		ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	xfs_off_t		range_base;
	size_t			range_length;
	xfs_bufhash_t		*hash;
	xfs_buf_t		*bp, *n;

	range_base = (ioff << BBSHIFT);
	range_length = (isize << BBSHIFT);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(range_length < (1 << btp->bt_sshift)));
	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));

	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];

	spin_lock(&hash->bh_lock);

	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
		ASSERT(btp == bp->b_target);
		if (bp->b_file_offset == range_base &&
		    bp->b_buffer_length == range_length) {
			/*
			 * If we look at something, bring it to the
			 * front of the list for next time.
			 */
			atomic_inc(&bp->b_hold);
			list_move(&bp->b_hash_list, &hash->bh_list);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		_xfs_buf_initialize(new_bp, btp, range_base,
					range_length, flags);
		new_bp->b_hash = hash;
		list_add(&new_bp->b_hash_list, &hash->bh_list);
	} else {
		XFS_STATS_INC(xb_miss_locked);
	}

	spin_unlock(&hash->bh_lock);
	return new_bp;

found:
	spin_unlock(&hash->bh_lock);

	/* Attempt to get the semaphore without sleeping,
	 * if this does not work then we need to drop the
	 * spinlock and do a hard attempt on the semaphore.
	 */
	if (down_trylock(&bp->b_sema)) {
		if (!(flags & XBF_TRYLOCK)) {
			/* wait for buffer ownership */
			XB_TRACE(bp, "get_lock", 0);
			xfs_buf_lock(bp);
			XFS_STATS_INC(xb_get_locked_waited);
		} else {
			/* We asked for a trylock and failed, no need
			 * to look at file offset and length here, we
			 * know that this buffer at least overlaps our
			 * buffer and is locked, therefore our buffer
			 * either does not exist, or is this buffer.
			 */
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
		}
	} else {
		/* trylock worked */
		XB_SET_OWNER(bp);
	}

	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= XBF_MAPPED;
	}
	XB_TRACE(bp, "got_lock", 0);
	XFS_STATS_INC(xb_get_locked);
	return bp;
}
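/*
 * Added usage note (not part of the original source): callers are
 * expected to pass a preallocated new_bp so the hash chain only needs to
 * be locked once for both the lookup and a possible insertion.  If the
 * range is already cached, the caller discards its new_bp and works with
 * the buffer returned here, which is locked and has an elevated hold
 * count; a NULL return with XBF_TRYLOCK set means the cached buffer was
 * busy.  xfs_buf_get_flags() below follows exactly this pattern.
 */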
/*
 *	Assembles a buffer covering the specified range.
 *	Storage in memory for all portions of the buffer will be allocated,
 *	although backing storage may not be.
 */
xfs_buf_t *
xfs_buf_get_flags(
	xfs_buftarg_t		*target,/* target for buffer		*/
	xfs_off_t		ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	xfs_buf_flags_t		flags)
{
	xfs_buf_t		*bp, *new_bp;
	int			error = 0, i;

	new_bp = xfs_buf_allocate(flags);
	if (unlikely(!new_bp))
		return NULL;

	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
	if (bp == new_bp) {
		error = _xfs_buf_lookup_pages(bp, flags);
		if (error)
			goto no_buffer;
	} else {
		xfs_buf_deallocate(new_bp);
		if (unlikely(bp == NULL))
			return NULL;
	}

	for (i = 0; i < bp->b_page_count; i++)
		mark_page_accessed(bp->b_pages[i]);

	if (!(bp->b_flags & XBF_MAPPED)) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			printk(KERN_WARNING "%s: failed to map pages\n",
					__FUNCTION__);
			goto no_buffer;
		}
	}

	XFS_STATS_INC(xb_get);

	/*
	 * Always fill in the block number now, the mapped cases can do
	 * their own overlay of this later.
	 */
	bp->b_bn = ioff;
	bp->b_count_desired = bp->b_buffer_length;

	XB_TRACE(bp, "get", (unsigned long)flags);
	return bp;

 no_buffer:
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
	return NULL;
}

xfs_buf_t *
xfs_buf_read_flags(
	xfs_buftarg_t		*target,