raid5.c

来自「linux 内核源代码」· C语言 代码 · 共 2,325 行 · 第 1/5 页

C
2,325
字号
/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->bm_write is the number of the last batch successfully written.
 * conf->bm_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is bm_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment bm_flush, thus closing the current
 *   batch.
 * When we notice that bm_flush > bm_write, we write out all pending updates
 * to the bitmap, and advance bm_write to where bm_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
#include <linux/kthread.h>
#include <asm/atomic.h>
#include "raid6.h"

#include <linux/raid/bitmap.h>
#include <linux/async_tx.h>

/*
 * Stripe cache
 *
 * One stripe_head covers STRIPE_SIZE bytes on each member device;
 * stripe heads are looked up through a hash table keyed by sector.
 */
#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define	IO_THRESHOLD		1
/* One page of hash buckets; NR_HASH is a power of two, so HASH_MASK works */
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

/* Map a sector to its hash bucket in conf->stripe_hashtbl */
#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)

/*
 * The following can be used to debug the driver
 */
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && defined(CONFIG_SMP)
# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

/* Under DEBUG, defeat inlining so every function is visible to a debugger */
#ifdef DEBUG
#define inline
#define __inline__
#endif

#if !RAID6_USE_EMPTY_ZERO_PAGE
/* In .bss so it's zeroed */
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
#endif

/* Advance to the next member disk, wrapping from raid_disks-1 back to 0 */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/*
 * Complete a chain of bios (linked through bi_next) back to their owners,
 * ending each with 0 or -EIO depending on its BIO_UPTODATE flag.
 */
static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;

	while (bi) {
		/* detach before ending: bi_end_io may recycle the bio */
		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bi->bi_end_io(bi,
			      test_bit(BIO_UPTODATE, &bi->bi_flags)
			        ? 0 : -EIO);
		bi = return_bi;
	}
}

static void print_raid5_conf (raid5_conf_t *conf);

/*
 * Drop one reference on a stripe_head.  On the last reference, move the
 * stripe to the appropriate list: delayed, bitmap-delayed, handle (work
 * for raid5d), or inactive (free).  Caller must hold conf->device_lock
 * (see release_stripe below).
 */
static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count)) {
		BUG_ON(!list_empty(&sh->lru));
		BUG_ON(atomic_read(&conf->active_stripes)==0);
		if (test_bit(STRIPE_HANDLE, &sh->state)) {
			if (test_bit(STRIPE_DELAYED, &sh->state)) {
				list_add_tail(&sh->lru, &conf->delayed_list);
				blk_plug_device(conf->mddev->queue);
			} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
				   sh->bm_seq - conf->seq_write > 0) {
				/* bitmap batch for this stripe not written yet;
				 * park it until the bitmap catches up (see
				 * BITMAP UNPLUGGING comment at top of file) */
				list_add_tail(&sh->lru, &conf->bitmap_list);
				blk_plug_device(conf->mddev->queue);
			} else {
				clear_bit(STRIPE_BIT_DELAY, &sh->state);
				list_add_tail(&sh->lru, &conf->handle_list);
			}
			md_wakeup_thread(conf->mddev->thread);
		} else {
			BUG_ON(sh->ops.pending);
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
			atomic_dec(&conf->active_stripes);
			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
				list_add_tail(&sh->lru, &conf->inactive_list);
				wake_up(&conf->wait_for_stripe);
				if (conf->retry_read_aligned)
					md_wakeup_thread(conf->mddev->thread);
			}
		}
	}
}

/* Locked wrapper around __release_stripe() */
static void release_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	__release_stripe(conf, sh);
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

/* Unhash a stripe_head (safe to call if already unhashed: hlist_del_init) */
static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

/* Hash a stripe_head by its sector; caller must hold conf->device_lock */
static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	CHECK_DEVLOCK();
	hlist_add_head(&sh->hash, hp);
}

/* find an idle stripe, make sure it is unhashed,
and return it. */
static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	CHECK_DEVLOCK();
	if (list_empty(&conf->inactive_list))
		goto out;
	first = conf->inactive_list.next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
out:
	return sh;
}

/* Release the first 'num' per-device pages of a stripe back to the page
 * allocator (counterpart of grow_buffers) */
static void shrink_buffers(struct stripe_head *sh, int num)
{
	struct page *p;
	int i;

	for (i=0; i<num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

/*
 * Allocate one page per device for the first 'num' devices of a stripe.
 * Returns 0 on success, 1 on allocation failure (pages already allocated
 * are left in place for shrink_buffers to reclaim).
 */
static int grow_buffers(struct stripe_head *sh, int num)
{
	int i;

	for (i=0; i<num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}

static void raid5_build_block (struct stripe_head *sh, int i);

/*
 * Re-initialise a (free, unhashed-or-about-to-be) stripe_head for a new
 * sector/parity layout and insert it into the hash table.  BUGs if the
 * stripe still has references, pending work, or per-device I/O attached.
 * Caller must hold conf->device_lock (CHECK_DEVLOCK).
 */
static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);

	CHECK_DEVLOCK();
	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);

	sh->sector = sector;
	sh->pd_idx = pd_idx;
	sh->state = 0;

	sh->disks = disks;

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			/* a freed stripe must have no I/O attached */
			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			BUG();
		}
		dev->flags = 0;
		raid5_build_block(sh, i);
	}
	insert_hash(conf, sh);
}

/* Hash-table lookup: find the cached stripe for (sector, disks), or NULL.
 * Caller must hold conf->device_lock (CHECK_DEVLOCK). */
static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
{
	struct stripe_head *sh;
	struct hlist_node *hn;

	CHECK_DEVLOCK();
	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->disks == disks)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

static void unplug_slaves(mddev_t *mddev);
static void raid5_unplug_device(struct request_queue *q);

/*
 * Get a reference to the stripe_head for 'sector', creating/initialising
 * one from the inactive list if it is not cached.  Waits for any quiesce
 * to finish first.  If the cache is exhausted: returns NULL when 'noblock'
 * is set, otherwise blocks (unplugging the device while waiting) until a
 * stripe becomes available.  The returned stripe has its refcount raised;
 * release with release_stripe().
 */
static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
					     int pd_idx, int noblock)
{
	struct stripe_head *sh;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0,
				    conf->device_lock, /* nothing */);
		sh = __find_stripe(conf, sector, disks);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				/* cache exhausted: mark it blocked and wait
				 * until usage drops below 3/4 of the cache */
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock,
						    raid5_unplug_device(conf->mddev->queue)
					);
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, pd_idx, disks);
		} else {
			/* cache hit: take it off whichever list it is on */
			if (atomic_read(&sh->count)) {
			  BUG_ON(!list_empty(&sh->lru));
			} else {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru) &&
				    !test_bit(STRIPE_EXPANDING, &sh->state))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);
	return sh;
}

/* test_and_ack_op() ensures that we only dequeue an operation once */
#define test_and_ack_op(op, pend) \
do {							\
	if (test_bit(op, &sh->ops.pending) &&		\
		!test_bit(op, &sh->ops.complete)) {	\
		if (test_and_set_bit(op, &sh->ops.ack)) \
			clear_bit(op, &pend);		\
		else					\
			ack++;				\
	} else						\
		clear_bit(op, &pend);			\
} while (0)

/* find new work
to run, do not resubmit work that is already
 * in flight */
static unsigned long get_stripe_work(struct stripe_head *sh)
{
	unsigned long pending;
	int ack = 0;

	/* snapshot of pending ops; test_and_ack_op clears bits in the
	 * snapshot for ops already acked (in flight) and counts fresh acks */
	pending = sh->ops.pending;
	test_and_ack_op(STRIPE_OP_BIOFILL, pending);
	test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
	test_and_ack_op(STRIPE_OP_PREXOR, pending);
	test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
	test_and_ack_op(STRIPE_OP_POSTXOR, pending);
	test_and_ack_op(STRIPE_OP_CHECK, pending);
	if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
		ack++;

	sh->ops.count -= ack;
	if (unlikely(sh->ops.count < 0)) {
		/* ops accounting went negative: internal inconsistency */
		printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
			"ops.complete: %#lx\n", pending, sh->ops.pending,
			sh->ops.ack, sh->ops.complete);
		BUG();
	}

	return pending;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);

/*
 * Submit the actual block-device I/O for every device of the stripe that
 * has R5_Wantread/R5_Wantwrite set.  Each device's embedded bio (dev->req,
 * single-vec over dev->vec) is (re)initialised and sent with
 * generic_make_request().  Devices whose rdev is missing or Faulty are
 * skipped: writes mark the stripe degraded, and the device is unlocked
 * so the stripe gets handled again.
 */
static void ops_run_io(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		struct bio *bi;
		mdk_rdev_t *rdev;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
			rw = WRITE;
		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else
			continue;

		bi = &sh->dev[i].req;

		bi->bi_rw = rw;
		if (rw == WRITE)
			bi->bi_end_io = raid5_end_write_request;
		else
			bi->bi_end_io = raid5_end_read_request;

		/* pin the rdev (nr_pending) under RCU so it cannot be
		 * removed while this bio is in flight */
		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		rcu_read_unlock();

		if (rdev) {
			if (test_bit(STRIPE_SYNCING, &sh->state) ||
				test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
				test_bit(STRIPE_EXPAND_READY, &sh->state))
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			bi->bi_bdev = rdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__FUNCTION__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			atomic_inc(&sh->count);
			bi->bi_sector = sh->sector + rdev->data_offset;
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_vcnt = 1;
			bi->bi_max_vecs = 1;
			bi->bi_idx = 0;
			bi->bi_io_vec = &sh->dev[i].vec;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			if (rw == WRITE &&
			    test_bit(R5_ReWrite, &sh->dev[i].flags))
				/* rewriting a previously-failed block:
				 * account it as a corrected error */
				atomic_add(STRIPE_SECTORS,
					&rdev->corrected_errors);
			generic_make_request(bi);
		} else {
			if (rw == WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}

static struct dma_async_tx_descriptor *

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?