xfs_buf.c
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include "xfs_linux.h"

STATIC kmem_cache_t *pagebuf_zone;
STATIC kmem_shaker_t pagebuf_shake;
STATIC int xfsbufd_wakeup(int, gfp_t);
STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);

STATIC struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;

#ifdef PAGEBUF_TRACE
void
pagebuf_trace(
	xfs_buf_t	*pb,
	char		*id,
	void		*data,
	void		*ra)
{
	ktrace_enter(pagebuf_trace_buf,
		pb, id,
		(void *)(unsigned long)pb->pb_flags,
		(void *)(unsigned long)pb->pb_hold.counter,
		(void *)(unsigned long)pb->pb_sema.count.counter,
		(void *)current,
		data, ra,
		(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
		(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
		(void *)(unsigned long)pb->pb_buffer_length,
		NULL, NULL, NULL, NULL, NULL);
}
ktrace_t *pagebuf_trace_buf;
#define PAGEBUF_TRACE_SIZE	4096
#define PB_TRACE(pb, id, data)	\
	pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
#else
#define PB_TRACE(pb, id, data)	do { } while (0)
#endif

#ifdef PAGEBUF_LOCK_TRACKING
# define PB_SET_OWNER(pb)	((pb)->pb_last_holder = current->pid)
# define PB_CLEAR_OWNER(pb)	((pb)->pb_last_holder = -1)
# define PB_GET_OWNER(pb)	((pb)->pb_last_holder)
#else
# define PB_SET_OWNER(pb)	do { } while (0)
# define PB_CLEAR_OWNER(pb)	do { } while (0)
# define PB_GET_OWNER(pb)	do { } while (0)
#endif

#define pb_to_gfp(flags) \
	((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \
	  ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)

#define pb_to_km(flags) \
	 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)

#define pagebuf_allocate(flags) \
	kmem_zone_alloc(pagebuf_zone, pb_to_km(flags))
#define pagebuf_deallocate(pb) \
	kmem_zone_free(pagebuf_zone, (pb));

/*
 * Page Region interfaces.
 *
 * For pages in filesystems where the blocksize is smaller than the
 * pagesize, we use the page->private field (long) to hold a bitmap
 * of uptodate regions within the page.
 *
 * Each such region is "bytes per page / bits per long" bytes long.
 *
 * NBPPR == number-of-bytes-per-page-region
 * BTOPR == bytes-to-page-region (rounded up)
 * BTOPRT == bytes-to-page-region-truncated (rounded down)
 */
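/*
 * Worked example (assuming 4096-byte pages and BITS_PER_LONG == 64; other
 * geometries scale the same way): NBPPR is 4096/64 = 64, so page->private
 * tracks 64 regions of 64 bytes each.  A 512-byte block at offset 0 spans
 * regions 0..7, so page_region_mask(0, 512) is 0xff; a block at offset 512
 * spans regions 8..15.  Once every block of the page has been marked, the
 * bitmap reaches ~0UL and set_page_region() flips the page to uptodate.
 */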
#if (BITS_PER_LONG == 32)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 5)	/* (32 == 1<<5) */
#elif (BITS_PER_LONG == 64)
#define PRSHIFT		(PAGE_CACHE_SHIFT - 6)	/* (64 == 1<<6) */
#else
#error BITS_PER_LONG must be 32 or 64
#endif
#define NBPPR		(PAGE_CACHE_SIZE/BITS_PER_LONG)
#define BTOPR(b)	(((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
#define BTOPRT(b)	(((unsigned int)(b) >> PRSHIFT))

STATIC unsigned long
page_region_mask(
	size_t		offset,
	size_t		length)
{
	unsigned long	mask;
	int		first, final;

	first = BTOPR(offset);
	final = BTOPRT(offset + length - 1);
	first = min(first, final);

	/* build a mask covering region bits first..final inclusive */
	mask = ~0UL;
	mask <<= BITS_PER_LONG - (final - first + 1);
	mask >>= BITS_PER_LONG - (final + 1);

	ASSERT(offset + length <= PAGE_CACHE_SIZE);
	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);

	return mask;
}

STATIC inline void
set_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	set_page_private(page,
		page_private(page) | page_region_mask(offset, length));
	if (page_private(page) == ~0UL)
		SetPageUptodate(page);
}

STATIC inline int
test_page_region(
	struct page	*page,
	size_t		offset,
	size_t		length)
{
	unsigned long	mask = page_region_mask(offset, length);

	return (mask && (page_private(page) & mask) == mask);
}

/*
 * Mapping of multi-page buffers into contiguous virtual space
 */

typedef struct a_list {
	void		*vm_addr;
	struct a_list	*next;
} a_list_t;

STATIC a_list_t		*as_free_head;
STATIC int		as_list_len;
STATIC DEFINE_SPINLOCK(as_lock);

/*
 * Try to batch vunmaps because they are costly.
 */
STATIC void
free_address(
	void		*addr)
{
	a_list_t	*aentry;

	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
	if (likely(aentry)) {
		spin_lock(&as_lock);
		aentry->next = as_free_head;
		aentry->vm_addr = addr;
		as_free_head = aentry;
		as_list_len++;
		spin_unlock(&as_lock);
	} else {
		vunmap(addr);
	}
}

STATIC void
purge_addresses(void)
{
	a_list_t	*aentry, *old;

	if (as_free_head == NULL)
		return;

	spin_lock(&as_lock);
	aentry = as_free_head;
	as_free_head = NULL;
	as_list_len = 0;
	spin_unlock(&as_lock);

	while ((old = aentry) != NULL) {
		vunmap(aentry->vm_addr);
		aentry = aentry->next;
		kfree(old);
	}
}

/*
 * Internal pagebuf object manipulation
 */

STATIC void
_pagebuf_initialize(
	xfs_buf_t		*pb,
	xfs_buftarg_t		*target,
	loff_t			range_base,
	size_t			range_length,
	page_buf_flags_t	flags)
{
	/*
	 * We don't want certain flags to appear in pb->pb_flags.
	 */
	flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);

	memset(pb, 0, sizeof(xfs_buf_t));
	atomic_set(&pb->pb_hold, 1);
	init_MUTEX_LOCKED(&pb->pb_iodonesema);
	INIT_LIST_HEAD(&pb->pb_list);
	INIT_LIST_HEAD(&pb->pb_hash_list);
	init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
	PB_SET_OWNER(pb);
	pb->pb_target = target;
	pb->pb_file_offset = range_base;
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	pb->pb_buffer_length = pb->pb_count_desired = range_length;
	pb->pb_flags = flags;
	pb->pb_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&pb->pb_pin_count, 0);
	init_waitqueue_head(&pb->pb_waiters);

	XFS_STATS_INC(pb_create);
	PB_TRACE(pb, "initialize", target);
}
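/*
 * In outline, the allocation path through the routines in this file:
 * xfs_buf_get_flags() takes a fresh xfs_buf_t from pagebuf_allocate() and
 * hands it to _pagebuf_find(); if no cached buffer covers the range,
 * _pagebuf_initialize() above sets it up, _pagebuf_lookup_pages() populates
 * pb_pages from the page cache (via _pagebuf_get_pages() below), and
 * _pagebuf_map_pages() gives multi-page buffers a contiguous virtual
 * mapping.  Teardown runs through pagebuf_free(), which drops the pages and
 * the mapping before freeing the buffer with pagebuf_deallocate().
 */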
/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_pagebuf_get_pages(
	xfs_buf_t		*pb,
	int			page_count,
	page_buf_flags_t	flags)
{
	/* Make sure that we have a page list */
	if (pb->pb_pages == NULL) {
		pb->pb_offset = page_buf_poff(pb->pb_file_offset);
		pb->pb_page_count = page_count;
		if (page_count <= PB_PAGES) {
			pb->pb_pages = pb->pb_page_array;
		} else {
			pb->pb_pages = kmem_alloc(sizeof(struct page *) *
					page_count, pb_to_km(flags));
			if (pb->pb_pages == NULL)
				return -ENOMEM;
		}
		memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees pb_pages if it was malloced.
 */
STATIC void
_pagebuf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->pb_pages != bp->pb_page_array) {
		kmem_free(bp->pb_pages,
			  bp->pb_page_count * sizeof(struct page *));
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use pagebuf_rele instead for
 * hashed and refcounted buffers.
 */
void
pagebuf_free(
	xfs_buf_t		*bp)
{
	PB_TRACE(bp, "free", 0);

	ASSERT(list_empty(&bp->pb_hash_list));

	if (bp->pb_flags & _PBF_PAGE_CACHE) {
		uint		i;

		if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
			free_address(bp->pb_addr - bp->pb_offset);

		for (i = 0; i < bp->pb_page_count; i++)
			page_cache_release(bp->pb_pages[i]);
		_pagebuf_free_pages(bp);
	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
		 /*
		  * XXX(hch): bp->pb_count_desired might be incorrect (see
		  * pagebuf_associate_memory for details), but fortunately
		  * the Linux version of kmem_free ignores the len argument..
		  */
		kmem_free(bp->pb_addr, bp->pb_count_desired);
		_pagebuf_free_pages(bp);
	}

	pagebuf_deallocate(bp);
}

/*
 * Finds all pages for the buffer in question and builds its page list.
 */
STATIC int
_pagebuf_lookup_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	struct address_space	*mapping = bp->pb_target->pbr_mapping;
	size_t			blocksize = bp->pb_target->pbr_bsize;
	size_t			size = bp->pb_count_desired;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = pb_to_gfp(flags);
	unsigned short		page_count, i;
	pgoff_t			first;
	loff_t			end;
	int			error;

	end = bp->pb_file_offset + bp->pb_buffer_length;
	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);

	error = _pagebuf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;
	bp->pb_flags |= _PBF_PAGE_CACHE;

	offset = bp->pb_offset;
	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;

	for (i = 0; i < bp->pb_page_count; i++) {
		struct page	*page;
		uint		retries = 0;

	      retry:
		page = find_or_create_page(mapping, first + i, gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & PBF_READ_AHEAD) {
				bp->pb_page_count = i;
				for (i = 0; i < bp->pb_page_count; i++)
					unlock_page(bp->pb_pages[i]);
				return -ENOMEM;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				printk(KERN_ERR
					"XFS: possible memory allocation "
					"deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, gfp_mask);

			XFS_STATS_INC(pb_page_retries);
			xfsbufd_wakeup(0, gfp_mask);
			blk_congestion_wait(WRITE, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(pb_page_found);

		nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
		size -= nbytes;

		if (!PageUptodate(page)) {
			page_count--;
			if (blocksize >= PAGE_CACHE_SIZE) {
				if (flags & PBF_READ)
					bp->pb_locked = 1;
			} else if (!PagePrivate(page)) {
				if (test_page_region(page, offset, nbytes))
					page_count++;
			}
		}

		bp->pb_pages[i] = page;
		offset = 0;
	}

	if (!bp->pb_locked) {
		for (i = 0; i < bp->pb_page_count; i++)
			unlock_page(bp->pb_pages[i]);
	}

	if (page_count == bp->pb_page_count)
		bp->pb_flags |= PBF_DONE;

	PB_TRACE(bp, "lookup_pages", (long)page_count);
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_pagebuf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	/* A single page buffer is always mappable */
	if (bp->pb_page_count == 1) {
		bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	} else if (flags & PBF_MAPPED) {
		if (as_list_len > 64)
			purge_addresses();
		bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
				VM_MAP, PAGE_KERNEL);
		if (unlikely(bp->pb_addr == NULL))
			return -ENOMEM;
		bp->pb_addr += bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	}

	return 0;
}

/*
 * Finding and Reading Buffers
 */

/*
 * _pagebuf_find
 *
 * Looks up, and creates if absent, a lockable buffer for
 * a given range of an inode.  The buffer is returned
 * locked.  If other overlapping buffers exist, they are
 * released before the new buffer is created and locked,
 * which may imply that this call will block until those buffers
 * are unlocked.  No I/O is implied by this call.
 */
xfs_buf_t *
_pagebuf_find(
	xfs_buftarg_t		*btp,	/* block device target		*/
	loff_t			ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	page_buf_flags_t	flags,	/* PBF_TRYLOCK			*/
	xfs_buf_t		*new_pb)/* newly allocated buffer	*/
{
	loff_t			range_base;
	size_t			range_length;
	xfs_bufhash_t		*hash;
	xfs_buf_t		*pb, *n;

	range_base = (ioff << BBSHIFT);
	range_length = (isize << BBSHIFT);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(range_length < (1 << btp->pbr_sshift)));
	ASSERT(!(range_base & (loff_t)btp->pbr_smask));

	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];

	spin_lock(&hash->bh_lock);

	list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
		ASSERT(btp == pb->pb_target);
		if (pb->pb_file_offset == range_base &&
		    pb->pb_buffer_length == range_length) {
			/*
			 * If we look at something bring it to the
			 * front of the list for next time.
			 */
			atomic_inc(&pb->pb_hold);
			list_move(&pb->pb_hash_list, &hash->bh_list);
			goto found;
		}
	}

	/* No match found */
	if (new_pb) {
		_pagebuf_initialize(new_pb, btp, range_base,
				range_length, flags);
		new_pb->pb_hash = hash;
		list_add(&new_pb->pb_hash_list, &hash->bh_list);
	} else {
		XFS_STATS_INC(pb_miss_locked);
	}

	spin_unlock(&hash->bh_lock);
	return new_pb;

found:
	spin_unlock(&hash->bh_lock);

	/* Attempt to get the semaphore without sleeping,
	 * if this does not work then we need to drop the
	 * spinlock and do a hard attempt on the semaphore.
	 */
	if (down_trylock(&pb->pb_sema)) {
		if (!(flags & PBF_TRYLOCK)) {
			/* wait for buffer ownership */
			PB_TRACE(pb, "get_lock", 0);
			pagebuf_lock(pb);
			XFS_STATS_INC(pb_get_locked_waited);
		} else {
			/* We asked for a trylock and failed, no need
			 * to look at file offset and length here, we
			 * know that this pagebuf at least overlaps our
			 * pagebuf and is locked, therefore our buffer
			 * either does not exist, or is this buffer
			 */

			pagebuf_rele(pb);
			XFS_STATS_INC(pb_busy_locked);
			return (NULL);
		}
	} else {
		/* trylock worked */
		PB_SET_OWNER(pb);
	}

	if (pb->pb_flags & PBF_STALE) {
		ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0);
		pb->pb_flags &= PBF_MAPPED;
	}
	PB_TRACE(pb, "got_lock", 0);
	XFS_STATS_INC(pb_get_locked);
	return (pb);
}

/*
 * xfs_buf_get_flags assembles a buffer covering the specified range.
 *
 * Storage in memory for all portions of the buffer will be allocated,
 * although backing storage may not be.
 */
xfs_buf_t *
xfs_buf_get_flags(			/* allocate a buffer		*/
	xfs_buftarg_t		*target,/* target for buffer		*/
	loff_t			ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
{
	xfs_buf_t		*pb, *new_pb;
	int			error = 0, i;

	new_pb = pagebuf_allocate(flags);
	if (unlikely(!new_pb))
		return NULL;

	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
	if (pb == new_pb) {
		error = _pagebuf_lookup_pages(pb, flags);
		if (error)
			goto no_buffer;
	} else {
		pagebuf_deallocate(new_pb);
		if (unlikely(pb == NULL))
			return NULL;
	}

	for (i = 0; i < pb->pb_page_count; i++)
		mark_page_accessed(pb->pb_pages[i]);

	if (!(pb->pb_flags & PBF_MAPPED)) {
		error = _pagebuf_map_pages(pb, flags);
		if (unlikely(error)) {
			printk(KERN_WARNING "%s: failed to map pages\n",
					__FUNCTION__);
			goto no_buffer;
		}
	}

	XFS_STATS_INC(pb_get);

	/*
	 * Always fill in the block number now, the mapped cases can do
	 * their own overlay of this later.
	 */
	pb->pb_bn = ioff;
	pb->pb_count_desired = pb->pb_buffer_length;

	PB_TRACE(pb, "get", (unsigned long)flags);
	return pb;

 no_buffer:
	if (flags & (PBF_LOCK | PBF_TRYLOCK))
		pagebuf_unlock(pb);
	pagebuf_rele(pb);
	return NULL;
}

xfs_buf_t *
xfs_buf_read_flags(
	xfs_buftarg_t		*target,
	loff_t			ioff,
	size_t			isize,
	page_buf_flags_t	flags)
{
	xfs_buf_t		*pb;

	flags |= PBF_READ;

	pb = xfs_buf_get_flags(target, ioff, isize, flags);
	if (pb) {