raid6main.c

来自「Linux Kernel 2.6.9 for OMAP1710」· C语言 代码 · 共 2,095 行 · 第 1/4 页

C
2,095
字号
	 * need to be failed	 */	if (failed > 2 && to_read+to_write+written) {		spin_lock_irq(&conf->device_lock);		for (i=disks; i--; ) {			/* fail all writes first */			bi = sh->dev[i].towrite;			sh->dev[i].towrite = NULL;			if (bi) to_write--;			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);				clear_bit(BIO_UPTODATE, &bi->bi_flags);				if (--bi->bi_phys_segments == 0) {					md_write_end(conf->mddev);					bi->bi_next = return_bi;					return_bi = bi;				}				bi = nextbi;			}			/* and fail all 'written' */			bi = sh->dev[i].written;			sh->dev[i].written = NULL;			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);				clear_bit(BIO_UPTODATE, &bi->bi_flags);				if (--bi->bi_phys_segments == 0) {					md_write_end(conf->mddev);					bi->bi_next = return_bi;					return_bi = bi;				}				bi = bi2;			}			/* fail any reads if this device is non-operational */			if (!test_bit(R5_Insync, &sh->dev[i].flags)) {				bi = sh->dev[i].toread;				sh->dev[i].toread = NULL;				if (bi) to_read--;				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);					clear_bit(BIO_UPTODATE, &bi->bi_flags);					if (--bi->bi_phys_segments == 0) {						bi->bi_next = return_bi;						return_bi = bi;					}					bi = nextbi;				}			}		}		spin_unlock_irq(&conf->device_lock);	}	if (failed > 2 && syncing) {		md_done_sync(conf->mddev, STRIPE_SECTORS,0);		clear_bit(STRIPE_SYNCING, &sh->state);		syncing = 0;	}	/*	 * might be able to return some write requests if the parity blocks	 * are safe, or on a failed drive	 */	pdev = &sh->dev[pd_idx];	p_failed = (failed >= 1 && failed_num[0] == pd_idx)		|| (failed >= 2 && failed_num[1] == pd_idx);	qdev = &sh->dev[qd_idx];	q_failed = (failed >= 1 && failed_num[0] == qd_idx)		|| (failed >= 2 && failed_num[1] == qd_idx);	if ( written &&	     ( p_failed || 
((test_bit(R5_Insync, &pdev->flags)			     && !test_bit(R5_LOCKED, &pdev->flags)			     && test_bit(R5_UPTODATE, &pdev->flags))) ) &&	     ( q_failed || ((test_bit(R5_Insync, &qdev->flags)			     && !test_bit(R5_LOCKED, &qdev->flags)			     && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {		/* any written block on an uptodate or failed drive can be		 * returned.  Note that if we 'wrote' to a failed drive,		 * it will be UPTODATE, but never LOCKED, so we don't need		 * to test 'failed' directly.		 */		for (i=disks; i--; )			if (sh->dev[i].written) {				dev = &sh->dev[i];				if (!test_bit(R5_LOCKED, &dev->flags) &&				    test_bit(R5_UPTODATE, &dev->flags) ) {					/* We can return any write requests */					struct bio *wbi, *wbi2;					PRINTK("Return write for stripe %llu disc %d\n",					       (unsigned long long)sh->sector, i);					spin_lock_irq(&conf->device_lock);					wbi = dev->written;					dev->written = NULL;					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {						wbi2 = r5_next_bio(wbi, dev->sector);						if (--wbi->bi_phys_segments == 0) {							md_write_end(conf->mddev);							wbi->bi_next = return_bi;							return_bi = wbi;						}						wbi = wbi2;					}					spin_unlock_irq(&conf->device_lock);				}			}	}	/* Now we might consider reading some blocks, either to check/generate	 * parity, or to satisfy requests	 * or to load a block that is being partially written.	 
*/	if (to_read || non_overwrite || (syncing && (uptodate < disks))) {		for (i=disks; i--;) {			dev = &sh->dev[i];			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&			    (dev->toread ||			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||			     syncing ||			     (failed >= 1 && (sh->dev[failed_num[0]].toread ||					 (sh->dev[failed_num[0]].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num[0]].flags)))) ||			     (failed >= 2 && (sh->dev[failed_num[1]].toread ||					 (sh->dev[failed_num[1]].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num[1]].flags))))				    )				) {				/* we would like to get this block, possibly				 * by computing it, but we might not be able to				 */				if (uptodate == disks-1) {					PRINTK("Computing stripe %llu block %d\n",					       (unsigned long long)sh->sector, i);					compute_block_1(sh, i);					uptodate++;				} else if ( uptodate == disks-2 && failed >= 2 ) {					/* Computing 2-failure is *very* expensive; only do it if failed >= 2 */					int other;					for (other=disks; other--;) {						if ( other == i )							continue;						if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )							break;					}					BUG_ON(other < 0);					PRINTK("Computing stripe %llu blocks %d,%d\n",					       (unsigned long long)sh->sector, i, other);					compute_block_2(sh, i, other);					uptodate += 2;				} else if (test_bit(R5_Insync, &dev->flags)) {					set_bit(R5_LOCKED, &dev->flags);					set_bit(R5_Wantread, &dev->flags);#if 0					/* if I am just reading this block and we don't have					   a failed drive, or any pending writes then sidestep the cache */					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&					    ! 
syncing && !failed && !to_write) {						sh->bh_cache[i]->b_page =  sh->bh_read[i]->b_page;						sh->bh_cache[i]->b_data =  sh->bh_read[i]->b_data;					}#endif					locked++;					PRINTK("Reading block %d (sync=%d)\n",						i, syncing);					if (syncing)						md_sync_acct(conf->disks[i].rdev->bdev,							     STRIPE_SECTORS);				}			}		}		set_bit(STRIPE_HANDLE, &sh->state);	}	/* now to consider writing and what else, if anything should be read */	if (to_write) {		int rcw=0, must_compute=0;		for (i=disks ; i--;) {			dev = &sh->dev[i];			/* Would I have to read this buffer for reconstruct_write */			if (!test_bit(R5_OVERWRITE, &dev->flags)			    && i != pd_idx && i != qd_idx			    && (!test_bit(R5_LOCKED, &dev->flags)#if 0				|| sh->bh_page[i] != bh->b_page#endif				    ) &&			    !test_bit(R5_UPTODATE, &dev->flags)) {				if (test_bit(R5_Insync, &dev->flags)) rcw++;				else {					PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);					must_compute++;				}			}		}		PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",		       (unsigned long long)sh->sector, rcw, must_compute);		set_bit(STRIPE_HANDLE, &sh->state);		if (rcw > 0)			/* want reconstruct write, but need to get some data */			for (i=disks; i--;) {				dev = &sh->dev[i];				if (!test_bit(R5_OVERWRITE, &dev->flags)				    && !(failed == 0 && (i == pd_idx || i == qd_idx))				    && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&				    test_bit(R5_Insync, &dev->flags)) {					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))					{						PRINTK("Read_old stripe %llu block %d for Reconstruct\n",						       (unsigned long long)sh->sector, i);						set_bit(R5_LOCKED, &dev->flags);						set_bit(R5_Wantread, &dev->flags);						locked++;					} else {						PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",						       (unsigned long long)sh->sector, i);						set_bit(STRIPE_DELAYED, &sh->state);						set_bit(STRIPE_HANDLE, &sh->state);					}				}			}		/* now if nothing is 
locked, and if we have enough data, we can start a write request */		if (locked == 0 && rcw == 0) {			if ( must_compute > 0 ) {				/* We have failed blocks and need to compute them */				switch ( failed ) {				case 0:	BUG();				case 1: compute_block_1(sh, failed_num[0]); break;				case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;				default: BUG();	/* This request should have been failed? */				}			}			PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);			compute_parity(sh, RECONSTRUCT_WRITE);			/* now every locked buffer is ready to be written */			for (i=disks; i--;)				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {					PRINTK("Writing stripe %llu block %d\n",					       (unsigned long long)sh->sector, i);					locked++;					set_bit(R5_Wantwrite, &sh->dev[i].flags);#if 0 /**** FIX: I don't understand the logic here... ****/					if (!test_bit(R5_Insync, &sh->dev[i].flags)					    || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */						set_bit(STRIPE_INSYNC, &sh->state);#endif				}			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {				atomic_dec(&conf->preread_active_stripes);				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)					md_wakeup_thread(conf->mddev->thread);			}		}	}	/* maybe we need to check and possibly fix the parity for this stripe	 * Any reads will already have been scheduled, so we just see if enough data	 * is available	 */	if (syncing && locked == 0 &&	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) {		set_bit(STRIPE_HANDLE, &sh->state);#if 0 /* RAID-6: Don't support CHECK PARITY yet */		if (failed == 0) {			char *pagea;			if (uptodate != disks)				BUG();			compute_parity(sh, CHECK_PARITY);			uptodate--;			pagea = page_address(sh->dev[pd_idx].page);			if ((*(u32*)pagea) == 0 &&			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {				/* parity is correct (on disc, not in buffer any more) */				set_bit(STRIPE_INSYNC, &sh->state);			}		}#endif		if (!test_bit(STRIPE_INSYNC, 
&sh->state)) {			int failed_needupdate[2];			struct r5dev *adev, *bdev;			if ( failed < 1 )				failed_num[0] = pd_idx;			if ( failed < 2 )				failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx;			failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags);			failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags);			PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n",			       failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]);#if 0  /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */			/* should be able to compute the missing block(s) and write to spare */			if ( failed_needupdate[0] ^ failed_needupdate[1] ) {				if (uptodate+1 != disks)					BUG();				compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]);				uptodate++;			} else if ( failed_needupdate[0] & failed_needupdate[1] ) {				if (uptodate+2 != disks)					BUG();				compute_block_2(sh, failed_num[0], failed_num[1]);				uptodate += 2;			}#else			compute_block_2(sh, failed_num[0], failed_num[1]);			uptodate += failed_needupdate[0] + failed_needupdate[1];#endif			if (uptodate != disks)				BUG();			PRINTK("Marking for sync stripe %llu blocks %d,%d\n",			       (unsigned long long)sh->sector, failed_num[0], failed_num[1]);			/**** FIX: Should we really do both of these unconditionally? 
****/			adev = &sh->dev[failed_num[0]];			locked += !test_bit(R5_LOCKED, &adev->flags);			set_bit(R5_LOCKED, &adev->flags);			set_bit(R5_Wantwrite, &adev->flags);			bdev = &sh->dev[failed_num[1]];			locked += !test_bit(R5_LOCKED, &bdev->flags);			set_bit(R5_LOCKED, &bdev->flags);			set_bit(R5_Wantwrite, &bdev->flags);			set_bit(STRIPE_INSYNC, &sh->state);			set_bit(R5_Syncio, &adev->flags);			set_bit(R5_Syncio, &bdev->flags);		}	}	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {		md_done_sync(conf->mddev, STRIPE_SECTORS,1);		clear_bit(STRIPE_SYNCING, &sh->state);	}	spin_unlock(&sh->lock);	while ((bi=return_bi)) {		int bytes = bi->bi_size;		return_bi = bi->bi_next;		bi->bi_next = NULL;		bi->bi_size = 0;		bi->bi_end_io(bi, bytes, 0);	}	for (i=disks; i-- ;) {		int rw;		struct bio *bi;		mdk_rdev_t *rdev;		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))			rw = 1;		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))			rw = 0;		else			continue;		bi = &sh->dev[i].req;		bi->bi_rw = rw;		if (rw)			bi->bi_end_io = raid6_end_write_request;		else			bi->bi_end_io = raid6_end_read_request;		spin_lock_irq(&conf->device_lock);		rdev = conf->disks[i].rdev;		if (rdev && rdev->faulty)			rdev = NULL;		if (rdev)			atomic_inc(&rdev->nr_pending);		spin_unlock_irq(&conf->device_lock);		if (rdev) {			if (test_bit(R5_Syncio, &sh->dev[i].flags))				md_sync_acct(rdev->bdev, STRIPE_SECTORS);			bi->bi_bdev = rdev->bdev;			PRINTK("for %llu schedule op %ld on disc %d\n",				(unsigned long long)sh->sector, bi->bi_rw, i);			atomic_inc(&sh->count);			bi->bi_sector = sh->sector + rdev->data_offset;			bi->bi_flags = 1 << BIO_UPTODATE;			bi->bi_vcnt = 1;			bi->bi_idx = 0;			bi->bi_io_vec = &sh->dev[i].vec;			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;			bi->bi_io_vec[0].bv_offset = 0;			bi->bi_size = STRIPE_SIZE;			bi->bi_next = NULL;			generic_make_request(bi);		} else {			PRINTK("skip op %ld on disc %d for sector %llu\n",				bi->bi_rw, i, (unsigned long 
long)sh->sector);
			/* no usable device: drop the lock bit and re-queue the stripe */
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}

/*
 * Move every stripe on conf->delayed_list back onto conf->handle_list,
 * but only once the number of pre-read-active stripes has dropped below
 * IO_THRESHOLD.  Each stripe moved has STRIPE_DELAYED cleared and
 * STRIPE_PREREAD_ACTIVE set, and is counted in
 * conf->preread_active_stripes.
 *
 * NOTE(review): the visible caller (raid6_unplug_device) invokes this
 * with conf->device_lock held; the unlocked list manipulation here seems
 * to rely on that -- confirm against other callers.
 */
static inline void raid6_activate_delayed(raid6_conf_t *conf)
{
	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
		while (!list_empty(&conf->delayed_list)) {
			struct list_head *l = conf->delayed_list.next;
			struct stripe_head *sh;
			sh = list_entry(l, struct stripe_head, lru);
			list_del_init(l);
			clear_bit(STRIPE_DELAYED, &sh->state);
			/* count the stripe only if it was not already marked active */
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			list_add_tail(&sh->lru, &conf->handle_list);
		}
	}
}

/*
 * Unplug the request queue of every member device that still has I/O
 * pending (rdev->nr_pending != 0).
 *
 * conf->device_lock must be dropped around the unplug_fn callback;
 * nr_pending is bumped first so the rdev stays referenced while the
 * lock is released, and decremented after it is retaken.
 */
static void unplug_slaves(mddev_t *mddev)
{
	raid6_conf_t *conf = mddev_to_conf(mddev);
	int i;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		if (rdev && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			/* hold a pending-I/O reference across the unlocked callback */
			atomic_inc(&rdev->nr_pending);
			spin_unlock_irqrestore(&conf->device_lock, flags);
			if (r_queue && r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);
			spin_lock_irqsave(&conf->device_lock, flags);
			atomic_dec(&rdev->nr_pending);
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

/*
 * Unplug callback for the array's own request queue: removing the
 * block-layer plug releases any delayed stripes, then the raid thread
 * is woken and every member device is unplugged in turn.
 */
static void raid6_unplug_device(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;
	raid6_conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);

	if (blk_remove_plug(q))
		raid6_activate_delayed(conf);
	md_wakeup_thread(mddev->thread);

	spin_unlock_irqrestore(&conf->device_lock, flags);

	unplug_slaves(mddev);
}

/*
 * Forward a cache-flush request to every working (non-faulty) member
 * device.  Returns 0 on success, -EOPNOTSUPP if a member queue has no
 * issue_flush_fn, or the first non-zero error from a member's flush.
 */
static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	raid6_conf_t *conf = mddev_to_conf(mddev);
	int i, ret = 0;

	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue;

			if (!bdev)
				continue;

			r_queue = bdev_get_queue(bdev);
			if (!r_queue)
				continue;

			if (!r_queue->issue_flush_fn) {
				ret = -EOPNOTSUPP;
				break;
			}

			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
			if (ret)
				break;
		}
	}
	return ret;
}

/* Plug the array's request queue under conf->device_lock. */
static inline void raid6_plug_device(raid6_conf_t *conf)
{
	spin_lock_irq(&conf->device_lock);
	blk_plug_device(conf->mddev->queue);
	spin_unlock_irq(&conf->device_lock);
}

/*
 * Entry point for bios submitted to the array: account the I/O,
 * round the start sector down to a stripe boundary, and walk the bio
 * one stripe (STRIPE_SECTORS) at a time.
 * (Body continues beyond this excerpt.)
 */
static int make_request (request_queue_t *q, struct bio * bi)
{
	mddev_t *mddev = q->queuedata;
	raid6_conf_t *conf = mddev_to_conf(mddev);
	/* RAID-6: two syndrome (P and Q) disks per stripe */
	const unsigned int raid_disks = conf->raid_disks;
	const unsigned int data_disks = raid_disks - 2;
	unsigned int dd_idx, pd_idx;
	sector_t new_sector;
	sector_t logical_sector, last_sector;
	struct stripe_head *sh;

	/* per-direction disk statistics */
	if (bio_data_dir(bi)==WRITE) {
		disk_stat_inc(mddev->gendisk, writes);
		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
	} else {
		disk_stat_inc(mddev->gendisk, reads);
		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
	}

	/* align the starting sector down to a full stripe */
	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
	last_sector = bi->bi_sector + (bi->bi_size>>9);
	bi->bi_next = NULL;
	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
	if ( bio_data_dir(bi) == WRITE )
		md_write_start(mddev);
	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?