raid5.c

From "Linux Kernel 2.6.9 for OMAP1710" · C source · 1,920 lines total · page 1 of 4

			bi = sh->dev[i].written;
			sh->dev[i].written = NULL;
			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
				clear_bit(BIO_UPTODATE, &bi->bi_flags);
				if (--bi->bi_phys_segments == 0) {
					md_write_end(conf->mddev);
					bi->bi_next = return_bi;
					return_bi = bi;
				}
				bi = bi2;
			}

			/* fail any reads if this device is non-operational */
			if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
				bi = sh->dev[i].toread;
				sh->dev[i].toread = NULL;
				if (bi) to_read--;
				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
					clear_bit(BIO_UPTODATE, &bi->bi_flags);
					if (--bi->bi_phys_segments == 0) {
						bi->bi_next = return_bi;
						return_bi = bi;
					}
					bi = nextbi;
				}
			}
		}
		spin_unlock_irq(&conf->device_lock);
	}
	if (failed > 1 && syncing) {
		md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
		clear_bit(STRIPE_SYNCING, &sh->state);
		syncing = 0;
	}

	/* might be able to return some write requests if the parity block
	 * is safe, or on a failed drive
	 */
	dev = &sh->dev[sh->pd_idx];
	if (written &&
	    ((test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
	      test_bit(R5_UPTODATE, &dev->flags))
	     || (failed == 1 && failed_num == sh->pd_idx))) {
		/* any written block on an uptodate or failed drive can be
		 * returned.  Note that if we 'wrote' to a failed drive, it
		 * will be UPTODATE, but never LOCKED, so we don't need to
		 * test 'failed' directly.
		 */
		for (i = disks; i--; )
			if (sh->dev[i].written) {
				dev = &sh->dev[i];
				if (!test_bit(R5_LOCKED, &dev->flags) &&
				    test_bit(R5_UPTODATE, &dev->flags)) {
					/* We can return any write requests */
					struct bio *wbi, *wbi2;
					PRINTK("Return write for disc %d\n", i);
					spin_lock_irq(&conf->device_lock);
					wbi = dev->written;
					dev->written = NULL;
					while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
						wbi2 = r5_next_bio(wbi, dev->sector);
						if (--wbi->bi_phys_segments == 0) {
							md_write_end(conf->mddev);
							wbi->bi_next = return_bi;
							return_bi = wbi;
						}
						wbi = wbi2;
					}
					spin_unlock_irq(&conf->device_lock);
				}
			}
	}

	/* Now we might consider reading some blocks, either to check/generate
	 * parity, or to satisfy requests
	 * or to load a block that is being partially written.
	 */
	if (to_read || non_overwrite || (syncing && (uptodate < disks))) {
		for (i = disks; i--; ) {
			dev = &sh->dev[i];
			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
			    (dev->toread ||
			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
			     syncing ||
			     (failed && (sh->dev[failed_num].toread ||
					 (sh->dev[failed_num].towrite &&
					  !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))))) {
				/* we would like to get this block, possibly
				 * by computing it, but we might not be able to
				 */
				if (uptodate == disks-1) {
					PRINTK("Computing block %d\n", i);
					compute_block(sh, i);
					uptodate++;
				} else if (test_bit(R5_Insync, &dev->flags)) {
					set_bit(R5_LOCKED, &dev->flags);
					set_bit(R5_Wantread, &dev->flags);
#if 0
					/* if I am just reading this block and we don't have
					   a failed drive, or any pending writes then sidestep the cache */
					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
					    !syncing && !failed && !to_write) {
						sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
						sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
					}
#endif
					locked++;
					PRINTK("Reading block %d (sync=%d)\n",
						i, syncing);
					if (syncing)
						md_sync_acct(conf->disks[i].rdev->bdev,
							     STRIPE_SECTORS);
				}
			}
		}
		set_bit(STRIPE_HANDLE, &sh->state);
	}
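	/*
	 * Write-strategy note: a read-modify-write (rmw) needs the old copy
	 * of each block being changed plus the old parity, while a
	 * reconstruct-write (rcw) needs every data block that is NOT being
	 * fully overwritten.  The loop below counts the reads each strategy
	 * would still require; a block that could only come from a
	 * non-in-sync disk is penalised by 2*disks, so that strategy is
	 * effectively never chosen.  For example, on a 5-disk array
	 * updating one block, rmw needs 2 reads (old data + old parity)
	 * while rcw needs 3 (the other three data blocks), so rmw wins.
	 */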
	/* now to consider writing and what else, if anything should be read */
	if (to_write) {
		int rmw = 0, rcw = 0;
		for (i = disks; i--; ) {
			/* would I have to read this buffer for read_modify_write */
			dev = &sh->dev[i];
			if ((dev->towrite || i == sh->pd_idx) &&
			    (!test_bit(R5_LOCKED, &dev->flags)
#if 0
			     || sh->bh_page[i] != bh->b_page
#endif
				    ) &&
			    !test_bit(R5_UPTODATE, &dev->flags)) {
				if (test_bit(R5_Insync, &dev->flags)
/*				    && !(!mddev->insync && i == sh->pd_idx) */
					)
					rmw++;
				else
					rmw += 2*disks;  /* cannot read it */
			}
			/* Would I have to read this buffer for reconstruct_write */
			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
			    (!test_bit(R5_LOCKED, &dev->flags)
#if 0
			     || sh->bh_page[i] != bh->b_page
#endif
				    ) &&
			    !test_bit(R5_UPTODATE, &dev->flags)) {
				if (test_bit(R5_Insync, &dev->flags))
					rcw++;
				else
					rcw += 2*disks;
			}
		}
		PRINTK("for sector %llu, rmw=%d rcw=%d\n",
			(unsigned long long)sh->sector, rmw, rcw);
		set_bit(STRIPE_HANDLE, &sh->state);
		if (rmw < rcw && rmw > 0)
			/* prefer read-modify-write, but need to get some data */
			for (i = disks; i--; ) {
				dev = &sh->dev[i];
				if ((dev->towrite || i == sh->pd_idx) &&
				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				    test_bit(R5_Insync, &dev->flags)) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
						PRINTK("Read_old block %d for r-m-w\n", i);
						set_bit(R5_LOCKED, &dev->flags);
						set_bit(R5_Wantread, &dev->flags);
						locked++;
					} else {
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		if (rcw <= rmw && rcw > 0)
			/* want reconstruct write, but need to get some data */
			for (i = disks; i--; ) {
				dev = &sh->dev[i];
				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				    test_bit(R5_Insync, &dev->flags)) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
						PRINTK("Read_old block %d for Reconstruct\n", i);
						set_bit(R5_LOCKED, &dev->flags);
						set_bit(R5_Wantread, &dev->flags);
						locked++;
					} else {
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		/* now if nothing is locked, and if we have enough data, we can start a write request */
		if (locked == 0 && (rcw == 0 || rmw == 0)) {
			PRINTK("Computing parity...\n");
			compute_parity(sh, rcw == 0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
			/* now every locked buffer is ready to be written */
			for (i = disks; i--; )
				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
					PRINTK("Writing block %d\n", i);
					locked++;
					set_bit(R5_Wantwrite, &sh->dev[i].flags);
					if (!test_bit(R5_Insync, &sh->dev[i].flags)
					    || (i == sh->pd_idx && failed == 0))
						set_bit(STRIPE_INSYNC, &sh->state);
				}
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
		}
	}
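	/*
	 * Resync note: compute_parity(sh, CHECK_PARITY) below XORs all the
	 * data blocks into the parity buffer, so if the on-disk parity was
	 * correct the buffer must end up all zeroes.  Checking the first
	 * 32-bit word and then memcmp()ing the page against itself offset
	 * by 4 bytes is a cheap way to verify that the whole STRIPE_SIZE
	 * page is zero.
	 */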
	/* maybe we need to check and possibly fix the parity for this stripe
	 * Any reads will already have been scheduled, so we just see if enough data
	 * is available
	 */
	if (syncing && locked == 0 &&
	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
		set_bit(STRIPE_HANDLE, &sh->state);
		if (failed == 0) {
			char *pagea;
			if (uptodate != disks)
				BUG();
			compute_parity(sh, CHECK_PARITY);
			uptodate--;
			pagea = page_address(sh->dev[sh->pd_idx].page);
			if ((*(u32*)pagea) == 0 &&
			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
				/* parity is correct (on disc, not in buffer any more) */
				set_bit(STRIPE_INSYNC, &sh->state);
			}
		}
		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
			if (failed == 0)
				failed_num = sh->pd_idx;
			/* should be able to compute the missing block and write it to spare */
			if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
				if (uptodate+1 != disks)
					BUG();
				compute_block(sh, failed_num);
				uptodate++;
			}
			if (uptodate != disks)
				BUG();
			dev = &sh->dev[failed_num];
			set_bit(R5_LOCKED, &dev->flags);
			set_bit(R5_Wantwrite, &dev->flags);
			locked++;
			set_bit(STRIPE_INSYNC, &sh->state);
			set_bit(R5_Syncio, &dev->flags);
		}
	}
	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
		clear_bit(STRIPE_SYNCING, &sh->state);
	}

	spin_unlock(&sh->lock);

	while ((bi = return_bi)) {
		int bytes = bi->bi_size;

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes, 0);
	}
	for (i = disks; i--; ) {
		int rw;
		struct bio *bi;
		mdk_rdev_t *rdev;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
			rw = 1;
		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = 0;
		else
			continue;

		bi = &sh->dev[i].req;

		bi->bi_rw = rw;
		if (rw)
			bi->bi_end_io = raid5_end_write_request;
		else
			bi->bi_end_io = raid5_end_read_request;

		spin_lock_irq(&conf->device_lock);
		rdev = conf->disks[i].rdev;
		if (rdev && rdev->faulty)
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		spin_unlock_irq(&conf->device_lock);

		if (rdev) {
			if (test_bit(R5_Syncio, &sh->dev[i].flags))
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			bi->bi_bdev = rdev->bdev;
			PRINTK("for %llu schedule op %ld on disc %d\n",
				(unsigned long long)sh->sector, bi->bi_rw, i);
			atomic_inc(&sh->count);
			bi->bi_sector = sh->sector + rdev->data_offset;
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_vcnt = 1;
			bi->bi_idx = 0;
			bi->bi_io_vec = &sh->dev[i].vec;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			generic_make_request(bi);
		} else {
			PRINTK("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}
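/*
 * Stripes marked STRIPE_DELAYED above are parked on conf->delayed_list so
 * that further writes get a chance to fill the stripe before the pre-reads
 * for a partial write are issued.  Once the number of stripes with active
 * pre-reads drops below IO_THRESHOLD, the helper below moves the delayed
 * stripes back onto handle_list for processing.
 */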
static inline void raid5_activate_delayed(raid5_conf_t *conf)
{
	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
		while (!list_empty(&conf->delayed_list)) {
			struct list_head *l = conf->delayed_list.next;
			struct stripe_head *sh;
			sh = list_entry(l, struct stripe_head, lru);
			list_del_init(l);
			clear_bit(STRIPE_DELAYED, &sh->state);
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			list_add_tail(&sh->lru, &conf->handle_list);
		}
	}
}

static void unplug_slaves(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);
	int i;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i = 0; i < mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		if (rdev && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			spin_unlock_irqrestore(&conf->device_lock, flags);

			if (r_queue && r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);

			spin_lock_irqsave(&conf->device_lock, flags);
			atomic_dec(&rdev->nr_pending);
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

static void raid5_unplug_device(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);

	if (blk_remove_plug(q))
		raid5_activate_delayed(conf);
	md_wakeup_thread(mddev->thread);

	spin_unlock_irqrestore(&conf->device_lock, flags);

	unplug_slaves(mddev);
}

static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	int i, ret = 0;

	for (i = 0; i < mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue;

			if (!bdev)
				continue;

			r_queue = bdev_get_queue(bdev);
			if (!r_queue)
				continue;

			if (!r_queue->issue_flush_fn) {
				ret = -EOPNOTSUPP;
				break;
			}

			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
			if (ret)
				break;
		}
	}
	return ret;
}

static inline void raid5_plug_device(raid5_conf_t *conf)
{
	spin_lock_irq(&conf->device_lock);
	blk_plug_device(conf->mddev->queue);
	spin_unlock_irq(&conf->device_lock);
}
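/*
 * make_request() is the entry point for all I/O submitted to the array.
 * Each incoming bio is walked in STRIPE_SECTORS-sized chunks, and
 * raid5_compute_sector() maps each chunk's logical sector to the matching
 * stripe sector plus the data-disk and parity-disk indices for that
 * stripe.  bi_phys_segments is re-used here as a count of the stripes
 * still working on the bio.
 */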
static int make_request(request_queue_t *q, struct bio *bi)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	const unsigned int raid_disks = conf->raid_disks;
	const unsigned int data_disks = raid_disks - 1;
	unsigned int dd_idx, pd_idx;
	sector_t new_sector;
	sector_t logical_sector, last_sector;
	struct stripe_head *sh;

	if (bio_data_dir(bi) == WRITE) {
		disk_stat_inc(mddev->gendisk, writes);
		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
	} else {
		disk_stat_inc(mddev->gendisk, reads);
		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
	}

	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
	last_sector = bi->bi_sector + (bi->bi_size>>9);
	bi->bi_next = NULL;
	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
	if (bio_data_dir(bi) == WRITE)
		md_write_start(mddev);
	for (; logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {

		new_sector = raid5_compute_sector(logical_sector,
						  raid_disks, data_disks, &dd_idx, &pd_idx, conf);

		PRINTK("raid5: make_request, sector %Lu logical %Lu\n",
			(unsigned long long)new_sector,
			(unsigned long long)logical_sector);

		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
		if (sh) {
			add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
			raid5_plug_device(conf);
			handle_stripe(sh);
			release_stripe(sh);
		} else {
			/* cannot get stripe for read-ahead, just give-up */
			clear_bit(BIO_UPTODATE, &bi->bi_flags);
			break;
		}

	}
	spin_lock_irq(&conf->device_lock);
	if (--bi->bi_phys_segments == 0) {
		int bytes = bi->bi_size;

		if (bio_data_dir(bi) == WRITE)
			md_write_end(mddev);
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes, 0);
	}
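	/*
	 * The decrement above pairs with the bi_phys_segments = 1 set at
	 * entry: every stripe that queued part of this bio took a reference
	 * via add_stripe_bio(), so bi_end_io() runs only once the last
	 * stripe segment completes.  (The listing is truncated here;
	 * make_request() continues on page 2 of 4 of the source.)
	 */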
