📄 xfs_buf.c
/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA  94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

/*
 * The xfs_buf.c code provides an abstract buffer cache model on top
 * of the Linux page cache.  Cached metadata blocks for a file system
 * are hashed to the inode for the block device.  xfs_buf.c assembles
 * buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
 *
 * Written by Steve Lord, Jim Mostek, Russell Cattelan
 * and Rajagopal Ananthanarayanan ("ananth") at SGI.
 *
 */

#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/locks.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>

#include "xfs_linux.h"

#define BN_ALIGN_MASK	((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1)

#ifndef GFP_READAHEAD
#define GFP_READAHEAD	0
#endif

/*
 * A backport of the 2.5 scheduler is used by many vendors of 2.4-based
 * distributions.
 * We can only guess its presence by the lack of the SCHED_YIELD flag.
 * If the heuristic doesn't work, change this define by hand.
 */
#ifndef SCHED_YIELD
#define __HAVE_NEW_SCHEDULER	1
#endif

/*
 * cpumask_t is used for supporting NR_CPUS > BITS_PER_LONG.
 * If support for this is present, migrate_to_cpu exists and provides
 * a wrapper around the set_cpus_allowed routine.
 */
#ifdef copy_cpumask
#define __HAVE_CPUMASK_T	1
#endif

#ifndef __HAVE_CPUMASK_T
# ifndef __HAVE_NEW_SCHEDULER
# define migrate_to_cpu(cpu)	\
	do { current->cpus_allowed = 1UL << (cpu); } while (0)
# else
# define migrate_to_cpu(cpu)	\
	set_cpus_allowed(current, 1UL << (cpu))
# endif
#endif
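/*
 * Added illustrative sketch, not part of the original source: with the
 * compatibility macro above, a per-CPU I/O completion daemon can bind
 * itself to its CPU in the same way on a stock 2.4 scheduler, an
 * O(1)-scheduler backport, or a cpumask_t-aware kernel.  The daemon
 * function below is hypothetical and only shows the intended call.
 */
#if 0	/* example only, not compiled */
STATIC int
example_iodone_daemon(
	void		*__cpu)
{
	int		cpu = (int)(long)__cpu;

	daemonize();
	migrate_to_cpu(cpu);	/* pin this kernel thread to "cpu" */
	/* ... service completed I/O queued for this CPU ... */
	return 0;
}
#endif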
#ifndef VM_MAP
#define VM_MAP	VM_ALLOC
#endif

/*
 * File wide globals
 */
STATIC kmem_cache_t *pagebuf_cache;
STATIC kmem_shaker_t pagebuf_shake;

#define MAX_IO_DAEMONS		NR_CPUS
#define CPU_TO_DAEMON(cpu)	(cpu)

STATIC int pb_logio_daemons[MAX_IO_DAEMONS];
STATIC struct list_head pagebuf_logiodone_tq[MAX_IO_DAEMONS];
STATIC wait_queue_head_t pagebuf_logiodone_wait[MAX_IO_DAEMONS];

STATIC int pb_dataio_daemons[MAX_IO_DAEMONS];
STATIC struct list_head pagebuf_dataiodone_tq[MAX_IO_DAEMONS];
STATIC wait_queue_head_t pagebuf_dataiodone_wait[MAX_IO_DAEMONS];

/*
 * For pre-allocated buffer head pool
 */
#define NR_RESERVED_BH	64
static wait_queue_head_t	pb_resv_bh_wait;
static spinlock_t		pb_resv_bh_lock = SPIN_LOCK_UNLOCKED;
struct buffer_head		*pb_resv_bh = NULL;	/* list of bh */
int				pb_resv_bh_cnt = 0;	/* # of bh available */

STATIC void _pagebuf_ioapply(xfs_buf_t *);
STATIC int pagebuf_daemon_wakeup(int, unsigned int);
STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
STATIC void pagebuf_runall_queues(struct list_head[]);

/*
 * Pagebuf debugging
 */
#ifdef PAGEBUF_TRACE
void
pagebuf_trace(
	xfs_buf_t	*pb,
	char		*id,
	void		*data,
	void		*ra)
{
	ktrace_enter(pagebuf_trace_buf,
		pb, id,
		(void *)(unsigned long)pb->pb_flags,
		(void *)(unsigned long)pb->pb_hold.counter,
		(void *)(unsigned long)pb->pb_sema.count.counter,
		(void *)current,
		data, ra,
		(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
		(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
		(void *)(unsigned long)pb->pb_buffer_length,
		NULL, NULL, NULL, NULL, NULL);
}
ktrace_t *pagebuf_trace_buf;
#define PAGEBUF_TRACE_SIZE	4096
#define PB_TRACE(pb, id, data)	\
	pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
#else
#define PB_TRACE(pb, id, data)	do { } while (0)
#endif

#ifdef PAGEBUF_LOCK_TRACKING
# define PB_SET_OWNER(pb)	((pb)->pb_last_holder = current->pid)
# define PB_CLEAR_OWNER(pb)	((pb)->pb_last_holder = -1)
# define PB_GET_OWNER(pb)	((pb)->pb_last_holder)
#else
# define PB_SET_OWNER(pb)	do { } while (0)
# define PB_CLEAR_OWNER(pb)	do { } while (0)
# define PB_GET_OWNER(pb)	do { } while (0)
#endif

/*
 * Pagebuf allocation / freeing.
 */
#define pb_to_gfp(flags) \
	(((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
	 ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)

#define pb_to_km(flags) \
	 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)

#define pagebuf_allocate(flags) \
	kmem_zone_alloc(pagebuf_cache, pb_to_km(flags))
#define pagebuf_deallocate(pb) \
	kmem_zone_free(pagebuf_cache, (pb));

/*
 * Pagebuf hashing
 */
#define NBITS	8
#define NHASH	(1<<NBITS)

typedef struct {
	struct list_head	pb_hash;
	spinlock_t		pb_hash_lock;
} pb_hash_t;

STATIC pb_hash_t	pbhash[NHASH];
#define pb_hash(pb)	&pbhash[pb->pb_hash_index]

STATIC int
_bhash(
	struct block_device *bdev,
	loff_t		base)
{
	int		bit, hval;

	base >>= 9;
	base ^= (unsigned long)bdev / L1_CACHE_BYTES;
	for (bit = hval = 0;
	     base && bit < sizeof(base) * 8;
	     bit += NBITS) {
		hval ^= (int)base & (NHASH-1);
		base >>= NBITS;
	}
	return hval;
}
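/*
 * Added illustrative sketch, not part of the original source: how a
 * lookup routine would pick and lock the hash bucket for a given block
 * device and byte offset.  The surrounding function is hypothetical;
 * only _bhash(), pbhash[] and pb_hash_lock come from this file.
 */
#if 0	/* example only, not compiled */
STATIC void
example_walk_bucket(
	struct block_device	*bdev,
	loff_t			range_base)
{
	pb_hash_t	*h = &pbhash[_bhash(bdev, range_base)];

	spin_lock(&h->pb_hash_lock);
	/* ... walk h->pb_hash looking for an xfs_buf_t that matches ... */
	spin_unlock(&h->pb_hash_lock);
}
#endif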
/*
 * Mapping of multi-page buffers into contiguous virtual space
 */
typedef struct a_list {
	void		*vm_addr;
	struct a_list	*next;
} a_list_t;

STATIC a_list_t		*as_free_head;
STATIC int		as_list_len;
STATIC spinlock_t	as_lock = SPIN_LOCK_UNLOCKED;

/*
 * Try to batch vunmaps because they are costly.
 */
STATIC void
free_address(
	void		*addr)
{
	a_list_t	*aentry;

	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
	if (aentry) {
		spin_lock(&as_lock);
		aentry->next = as_free_head;
		aentry->vm_addr = addr;
		as_free_head = aentry;
		as_list_len++;
		spin_unlock(&as_lock);
	} else {
		vunmap(addr);
	}
}

STATIC void
purge_addresses(void)
{
	a_list_t	*aentry, *old;

	if (as_free_head == NULL)
		return;

	spin_lock(&as_lock);
	aentry = as_free_head;
	as_free_head = NULL;
	as_list_len = 0;
	spin_unlock(&as_lock);

	while ((old = aentry) != NULL) {
		vunmap(aentry->vm_addr);
		aentry = aentry->next;
		kfree(old);
	}
}

/*
 * Internal pagebuf object manipulation
 */
STATIC void
_pagebuf_initialize(
	xfs_buf_t		*pb,
	xfs_buftarg_t		*target,
	loff_t			range_base,
	size_t			range_length,
	page_buf_flags_t	flags)
{
	/*
	 * We don't want certain flags to appear in pb->pb_flags.
	 */
	flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);

	memset(pb, 0, sizeof(xfs_buf_t));
	atomic_set(&pb->pb_hold, 1);
	init_MUTEX_LOCKED(&pb->pb_iodonesema);
	INIT_LIST_HEAD(&pb->pb_list);
	INIT_LIST_HEAD(&pb->pb_hash_list);
	init_MUTEX_LOCKED(&pb->pb_sema);	/* held, no waiters */
	PB_SET_OWNER(pb);
	pb->pb_target = target;
	pb->pb_file_offset = range_base;
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	pb->pb_buffer_length = pb->pb_count_desired = range_length;
	pb->pb_flags = flags | PBF_NONE;
	pb->pb_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&pb->pb_pin_count, 0);
	init_waitqueue_head(&pb->pb_waiters);

	XFS_STATS_INC(pb_create);
	PB_TRACE(pb, "initialize", target);
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_pagebuf_get_pages(
	xfs_buf_t		*pb,
	int			page_count,
	page_buf_flags_t	flags)
{
	/* Make sure that we have a page list */
	if (pb->pb_pages == NULL) {
		pb->pb_offset = page_buf_poff(pb->pb_file_offset);
		pb->pb_page_count = page_count;
		if (page_count <= PB_PAGES) {
			pb->pb_pages = pb->pb_page_array;
		} else {
			pb->pb_pages = kmem_alloc(sizeof(struct page *) *
					page_count, pb_to_km(flags));
			if (pb->pb_pages == NULL)
				return -ENOMEM;
		}
		memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees pb_pages if it was malloced.
 */
STATIC void
_pagebuf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->pb_pages != bp->pb_page_array) {
		kmem_free(bp->pb_pages,
			  bp->pb_page_count * sizeof(struct page *));
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use pagebuf_rele instead for
 * hashed and refcounted buffers.
 */
void
pagebuf_free(
	xfs_buf_t		*bp)
{
	PB_TRACE(bp, "free", 0);

	ASSERT(list_empty(&bp->pb_hash_list));

	if (bp->pb_flags & _PBF_PAGE_CACHE) {
		uint		i;

		if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
			free_address(bp->pb_addr - bp->pb_offset);

		for (i = 0; i < bp->pb_page_count; i++)
			page_cache_release(bp->pb_pages[i]);
		_pagebuf_free_pages(bp);
	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
		/*
		 * XXX(hch): bp->pb_count_desired might be incorrect (see
		 * pagebuf_associate_memory for details), but fortunately
		 * the Linux version of kmem_free ignores the len argument..
		 */
		kmem_free(bp->pb_addr, bp->pb_count_desired);
		_pagebuf_free_pages(bp);
	}

	pagebuf_deallocate(bp);
}
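/*
 * Added illustrative sketch, not part of the original source: the
 * lifecycle pagebuf_free() expects for an unhashed buffer -- allocate
 * the descriptor, run it through _pagebuf_initialize(), and hand it
 * back through pagebuf_free() rather than pagebuf_rele().  The wrapper
 * function itself is hypothetical.
 */
#if 0	/* example only, not compiled */
STATIC xfs_buf_t *
example_get_unhashed_buf(
	xfs_buftarg_t		*target,
	size_t			len,
	page_buf_flags_t	flags)
{
	xfs_buf_t	*pb;

	pb = pagebuf_allocate(flags);
	if (pb)
		_pagebuf_initialize(pb, target, 0, len, flags);
	/* ... caller uses the buffer, then releases it via pagebuf_free(pb) ... */
	return pb;
}
#endif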
/*
 * Finds all pages for the buffer in question and builds its page list.
 */
STATIC int
_pagebuf_lookup_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	struct address_space	*mapping = bp->pb_target->pbr_mapping;
	size_t			blocksize = bp->pb_target->pbr_bsize;
	int			gfp_mask = pb_to_gfp(flags);
	unsigned short		page_count, i;
	pgoff_t			first;
	loff_t			end;
	int			error;

	end = bp->pb_file_offset + bp->pb_buffer_length;
	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);

	error = _pagebuf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;
	bp->pb_flags |= _PBF_PAGE_CACHE;

	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;

	for (i = 0; i < bp->pb_page_count; i++) {
		struct page	*page;
		uint		retries = 0;

	      retry:
		page = find_or_create_page(mapping, first + i, gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & PBF_READ_AHEAD) {
				bp->pb_page_count = i;
				for (i = 0; i < bp->pb_page_count; i++)
					unlock_page(bp->pb_pages[i]);
				return -ENOMEM;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				printk(KERN_ERR
					"possible deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, gfp_mask);

			XFS_STATS_INC(pb_page_retries);
			pagebuf_daemon_wakeup(0, gfp_mask);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(10);
			goto retry;
		}

		XFS_STATS_INC(pb_page_found);

		/* if we need to do I/O on a page record the fact */
		if (!Page_Uptodate(page)) {
			page_count--;
			if (blocksize == PAGE_CACHE_SIZE && (flags & PBF_READ))
				bp->pb_locked = 1;
		}

		bp->pb_pages[i] = page;
	}

	if (!bp->pb_locked) {
		for (i = 0; i < bp->pb_page_count; i++)
			unlock_page(bp->pb_pages[i]);
	}

	if (page_count) {
		/* if we have any uptodate pages, mark that in the buffer */
		bp->pb_flags &= ~PBF_NONE;

		/* if some pages aren't uptodate, mark that in the buffer */
		if (page_count != bp->pb_page_count)
			bp->pb_flags |= PBF_PARTIAL;
	}

	PB_TRACE(bp, "lookup_pages", (long)page_count);
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_pagebuf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	/* A single page buffer is always mappable */
	if (bp->pb_page_count == 1) {
		bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	} else if (flags & PBF_MAPPED) {
		if (as_list_len > 64)
			purge_addresses();
		bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
				VM_MAP, PAGE_KERNEL);
		if (unlikely(bp->pb_addr == NULL))
			return -ENOMEM;
		bp->pb_addr += bp->pb_offset;
		bp->pb_flags |= PBF_MAPPED;
	}

	return 0;
}

/*
 * Pre-allocation of a pool of buffer heads for use in
 * low-memory situations.
 */

/*
 * _pagebuf_prealloc_bh
 *
 * Pre-allocate a pool of "count" buffer heads at startup.
 * Puts them on a list at "pb_resv_bh".
 * Returns the number of bh actually allocated to the pool.
 */
STATIC int
_pagebuf_prealloc_bh(
	int		count)
{
	struct buffer_head	*bh;
	int			i;

	for (i = 0; i < count; i++) {
		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
		if (!bh)
			break;
		bh->b_pprev = &pb_resv_bh;
		bh->b_next = pb_resv_bh;
		pb_resv_bh = bh;
		pb_resv_bh_cnt++;
	}
	return i;
}
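/*
 * Added illustrative sketch, not part of the original source: intended
 * use of the reserved buffer-head pool.  The pool is filled once at
 * initialization time; under memory pressure a bh is taken with
 * _pagebuf_get_prealloc_bh() (defined below) and must be returned via
 * _pagebuf_free_bh() instead of kmem_cache_free().  The init function
 * shown here is hypothetical.
 */
#if 0	/* example only, not compiled */
STATIC void
example_bh_pool_init(void)
{
	init_waitqueue_head(&pb_resv_bh_wait);
	_pagebuf_prealloc_bh(NR_RESERVED_BH);	/* fill the reserve pool */
}
#endif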
/*
 * _pagebuf_get_prealloc_bh
 *
 * Get one buffer head from our pre-allocated pool.
 * If pool is empty, sleep 'til one comes back in.
 * Returns aforementioned buffer head.
 */
STATIC struct buffer_head *
_pagebuf_get_prealloc_bh(void)
{
	unsigned long		flags;
	struct buffer_head	*bh;
	DECLARE_WAITQUEUE	(wait, current);

	spin_lock_irqsave(&pb_resv_bh_lock, flags);

	if (pb_resv_bh_cnt < 1) {
		add_wait_queue(&pb_resv_bh_wait, &wait);
		do {
			set_current_state(TASK_UNINTERRUPTIBLE);
			spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
			run_task_queue(&tq_disk);
			schedule();
			spin_lock_irqsave(&pb_resv_bh_lock, flags);
		} while (pb_resv_bh_cnt < 1);
		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&pb_resv_bh_wait, &wait);
	}

	BUG_ON(pb_resv_bh_cnt < 1);
	BUG_ON(!pb_resv_bh);

	bh = pb_resv_bh;
	pb_resv_bh = bh->b_next;
	pb_resv_bh_cnt--;

	spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
	return bh;
}

/*
 * _pagebuf_free_bh
 *
 * Take care of buffer heads that we're finished with.
 * Call this instead of just kmem_cache_free(bh_cachep, bh)
 * when you're done with a bh.
 *
 * If our pre-allocated pool is full, just free the buffer head.
 * Otherwise, put it back in the pool, and wake up anybody
 * waiting for one.
 */
STATIC inline void
_pagebuf_free_bh(
	struct buffer_head	*bh)
{
	unsigned long		flags;
	int			free;