raid10.c

From "Linux Kernel 2.6.9 for OMAP1710" · C source · 1,781 lines total · page 1 of 4

C
1,781
(font-size control)
	mempool_destroy(conf->r10buf_pool);	conf->r10buf_pool = NULL;}static int raid10_spare_active(mddev_t *mddev){	int i;	conf_t *conf = mddev->private;	mirror_info_t *tmp;	spin_lock_irq(&conf->device_lock);	/*	 * Find all non-in_sync disks within the RAID10 configuration	 * and mark them in_sync	 */	for (i = 0; i < conf->raid_disks; i++) {		tmp = conf->mirrors + i;		if (tmp->rdev		    && !tmp->rdev->faulty		    && !tmp->rdev->in_sync) {			conf->working_disks++;			mddev->degraded--;			tmp->rdev->in_sync = 1;		}	}	spin_unlock_irq(&conf->device_lock);	print_conf(conf);	return 0;}static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev){	conf_t *conf = mddev->private;	int found = 0;	int mirror;	mirror_info_t *p;	if (mddev->recovery_cp < MaxSector)		/* only hot-add to in-sync arrays, as recovery is		 * very different from resync		 */		return 0;	spin_lock_irq(&conf->device_lock);	for (mirror=0; mirror < mddev->raid_disks; mirror++)		if ( !(p=conf->mirrors+mirror)->rdev) {			p->rdev = rdev;			blk_queue_stack_limits(mddev->queue,					       rdev->bdev->bd_disk->queue);			/* as we don't honour merge_bvec_fn, we must never risk			 * violating it, so limit ->max_sector to one PAGE, as			 * a one page request is never in violation.			 
*/			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&			    mddev->queue->max_sectors > (PAGE_SIZE>>9))				mddev->queue->max_sectors = (PAGE_SIZE>>9);			p->head_position = 0;			rdev->raid_disk = mirror;			found = 1;			break;		}	spin_unlock_irq(&conf->device_lock);	print_conf(conf);	return found;}static int raid10_remove_disk(mddev_t *mddev, int number){	conf_t *conf = mddev->private;	int err = 1;	mirror_info_t *p = conf->mirrors+ number;	print_conf(conf);	spin_lock_irq(&conf->device_lock);	if (p->rdev) {		if (p->rdev->in_sync ||		    atomic_read(&p->rdev->nr_pending)) {			err = -EBUSY;			goto abort;		}		p->rdev = NULL;		err = 0;	}	if (err)		MD_BUG();abort:	spin_unlock_irq(&conf->device_lock);	print_conf(conf);	return err;}static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error){	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);	conf_t *conf = mddev_to_conf(r10_bio->mddev);	int i,d;	if (bio->bi_size)		return 1;	for (i=0; i<conf->copies; i++)		if (r10_bio->devs[i].bio == bio)			break;	if (i == conf->copies)		BUG();	update_head_pos(i, r10_bio);	d = r10_bio->devs[i].devnum;	if (!uptodate)		md_error(r10_bio->mddev,			 conf->mirrors[d].rdev);	/* for reconstruct, we always reschedule after a read.	 
* for resync, only after all reads	 */	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||	    atomic_dec_and_test(&r10_bio->remaining)) {		/* we have read all the blocks,		 * do the comparison in process context in raid10d		 */		reschedule_retry(r10_bio);	}	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);	return 0;}static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error){	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);	mddev_t *mddev = r10_bio->mddev;	conf_t *conf = mddev_to_conf(mddev);	int i,d;	if (bio->bi_size)		return 1;	for (i = 0; i < conf->copies; i++)		if (r10_bio->devs[i].bio == bio)			break;	d = r10_bio->devs[i].devnum;	if (!uptodate)		md_error(mddev, conf->mirrors[d].rdev);	update_head_pos(i, r10_bio);	while (atomic_dec_and_test(&r10_bio->remaining)) {		if (r10_bio->master_bio == NULL) {			/* the primary of several recovery bios */			md_done_sync(mddev, r10_bio->sectors, 1);			put_buf(r10_bio);			break;		} else {			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;			put_buf(r10_bio);			r10_bio = r10_bio2;		}	}	rdev_dec_pending(conf->mirrors[d].rdev, mddev);	return 0;}/* * Note: sync and recover and handled very differently for raid10 * This code is for resync. * For resync, we read through virtual addresses and read all blocks. * If there is any error, we schedule a write.  The lowest numbered * drive is authoritative. * However requests come for physical address, so we need to map. * For every physical address there are raid_disks/copies virtual addresses, * which is always are least one, but is not necessarly an integer. * This means that a physical address can span multiple chunks, so we may * have to submit multiple io requests for a single sync request. 
*//* * We check if all blocks are in-sync and only write to blocks that * aren't in sync */static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio){	conf_t *conf = mddev_to_conf(mddev);	int i, first;	struct bio *tbio, *fbio;	atomic_set(&r10_bio->remaining, 1);	/* find the first device with a block */	for (i=0; i<conf->copies; i++)		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))			break;	if (i == conf->copies)		goto done;	first = i;	fbio = r10_bio->devs[i].bio;	/* now find blocks with errors */	for (i=first+1 ; i < conf->copies ; i++) {		int vcnt, j, d;		if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))			continue;		/* We know that the bi_io_vec layout is the same for		 * both 'first' and 'i', so we just compare them.		 * All vec entries are PAGE_SIZE;		 */		tbio = r10_bio->devs[i].bio;		vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);		for (j = 0; j < vcnt; j++)			if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),				   page_address(tbio->bi_io_vec[j].bv_page),				   PAGE_SIZE))				break;		if (j == vcnt)			continue;		/* Ok, we need to write this bio		 * First we need to fixup bv_offset, bv_len and		 * bi_vecs, as the read request might have corrupted these		 */		tbio->bi_vcnt = vcnt;		tbio->bi_size = r10_bio->sectors << 9;		tbio->bi_idx = 0;		tbio->bi_phys_segments = 0;		tbio->bi_hw_segments = 0;		tbio->bi_hw_front_size = 0;		tbio->bi_hw_back_size = 0;		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);		tbio->bi_flags |= 1 << BIO_UPTODATE;		tbio->bi_next = NULL;		tbio->bi_rw = WRITE;		tbio->bi_private = r10_bio;		tbio->bi_sector = r10_bio->devs[i].addr;		for (j=0; j < vcnt ; j++) {			tbio->bi_io_vec[j].bv_offset = 0;			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;			memcpy(page_address(tbio->bi_io_vec[j].bv_page),			       page_address(fbio->bi_io_vec[j].bv_page),			       PAGE_SIZE);		}		tbio->bi_end_io = end_sync_write;		d = r10_bio->devs[i].devnum;		atomic_inc(&conf->mirrors[d].rdev->nr_pending);		atomic_inc(&r10_bio->remaining);		
md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);		generic_make_request(tbio);	}done:	if (atomic_dec_and_test(&r10_bio->remaining)) {		md_done_sync(mddev, r10_bio->sectors, 1);		put_buf(r10_bio);	}}/* * Now for the recovery code. * Recovery happens across physical sectors. * We recover all non-is_sync drives by finding the virtual address of * each, and then choose a working drive that also has that virt address. * There is a separate r10_bio for each non-in_sync drive. * Only the first two slots are in use. The first for reading, * The second for writing. * */static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio){	conf_t *conf = mddev_to_conf(mddev);	int i, d;	struct bio *bio, *wbio;	/* move the pages across to the second bio	 * and submit the write request	 */	bio = r10_bio->devs[0].bio;	wbio = r10_bio->devs[1].bio;	for (i=0; i < wbio->bi_vcnt; i++) {		struct page *p = bio->bi_io_vec[i].bv_page;		bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;		wbio->bi_io_vec[i].bv_page = p;	}	d = r10_bio->devs[1].devnum;	atomic_inc(&conf->mirrors[d].rdev->nr_pending);	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);	generic_make_request(wbio);}/* * This is a kernel thread which: * *	1.	Retries failed read operations on working mirrors. *	2.	Updates the raid superblock when problems encounter. *	3.	Performs writes following reads for array syncronising. 
*/static void raid10d(mddev_t *mddev){	r10bio_t *r10_bio;	struct bio *bio;	unsigned long flags;	conf_t *conf = mddev_to_conf(mddev);	struct list_head *head = &conf->retry_list;	int unplug=0;	mdk_rdev_t *rdev;	md_check_recovery(mddev);	md_handle_safemode(mddev);	for (;;) {		char b[BDEVNAME_SIZE];		spin_lock_irqsave(&conf->device_lock, flags);		if (list_empty(head))			break;		r10_bio = list_entry(head->prev, r10bio_t, retry_list);		list_del(head->prev);		spin_unlock_irqrestore(&conf->device_lock, flags);		mddev = r10_bio->mddev;		conf = mddev_to_conf(mddev);		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {			sync_request_write(mddev, r10_bio);			unplug = 1;		} else 	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {			recovery_request_write(mddev, r10_bio);			unplug = 1;		} else {			int mirror;			bio = r10_bio->devs[r10_bio->read_slot].bio;			r10_bio->devs[r10_bio->read_slot].bio = NULL;			mirror = read_balance(conf, r10_bio);			r10_bio->devs[r10_bio->read_slot].bio = bio;			if (mirror == -1) {				printk(KERN_ALERT "raid10: %s: unrecoverable I/O"				       " read error for block %llu\n",				       bdevname(bio->bi_bdev,b),				       (unsigned long long)r10_bio->sector);				raid_end_bio_io(r10_bio);			} else {				rdev = conf->mirrors[mirror].rdev;				if (printk_ratelimit())					printk(KERN_ERR "raid10: %s: redirecting sector %llu to"					       " another mirror\n",					       bdevname(rdev->bdev,b),					       (unsigned long long)r10_bio->sector);				bio->bi_bdev = rdev->bdev;				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr					+ rdev->data_offset;				bio->bi_next = NULL;				bio->bi_flags &= (1<<BIO_CLONED);				bio->bi_flags |= 1 << BIO_UPTODATE;				bio->bi_idx = 0;				bio->bi_size = r10_bio->sectors << 9;				bio->bi_rw = READ;				unplug = 1;				generic_make_request(bio);			}		}	}	spin_unlock_irqrestore(&conf->device_lock, flags);	if (unplug)		unplug_slaves(mddev);}static int init_resync(conf_t *conf){	int buffs;	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;	
if (conf->r10buf_pool)		BUG();	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);	if (!conf->r10buf_pool)		return -ENOMEM;	conf->next_resync = 0;	return 0;}/* * perform a "sync" on one "block" * * We need to make sure that no normal I/O request - particularly write * requests - conflict with active sync requests. * * This is achieved by tracking pending requests and a 'barrier' concept * that can be installed to exclude normal IO requests. * * Resync and recovery are handled very differently. * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. * * For resync, we iterate over virtual addresses, read all copies, * and update if there are differences.  If only one copy is live, * skip it. * For recovery, we iterate over physical addresses, read a good * value for each non-in_sync drive, and over-write. * * So, for recovery we may have several outstanding complex requests for a * given address, one for each out-of-sync device.  We model this by allocating * a number of r10_bio structures, one for each out-of-sync device. * As we setup these structures, we collect all bio's together into a list * which we then process collectively to add pages, and then process again * to pass to generic_make_request. * * The r10_bio structures are linked using a borrowed master_bio pointer. * This link is counted in ->remaining.  When the r10_bio that points to NULL * has its remaining count decremented to 0, the whole complex operation * is complete. 
* */static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster){	conf_t *conf = mddev_to_conf(mddev);	r10bio_t *r10_bio;	struct bio *biolist = NULL, *bio;	sector_t max_sector, nr_sectors;	int disk;	int i;	sector_t sectors_skipped = 0;	int chunks_skipped = 0;	if (!conf->r10buf_pool)		if (init_resync(conf))			return -ENOMEM; skipped:	max_sector = mddev->size << 1;	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))		max_sector = mddev->resync_max_sectors;	if (sector_nr >= max_sector) {		close_sync(conf);

Keyboard shortcuts

Copy code: Ctrl + C
Search code: Ctrl + F
Full-screen mode: F11
Increase font size: Ctrl + =
Decrease font size: Ctrl + -
Show shortcuts: ?