raid10.c

来自「linux 内核源代码」· C语言 代码 · 共 2,188 行 · 第 1/4 页

C
2,188
字号
	/* NOTE(review): tail of a device-removal helper whose head lies above
	 * this view -- 'number', 'err', 'rdev' and 'conf' are declared there. */
	mirror_info_t *p = conf->mirrors + number;

	print_conf(conf);
	rdev = p->rdev;
	if (rdev) {
		/* Refuse removal while the device is in-sync or has I/O
		 * in flight; caller must retry later. */
		if (test_bit(In_sync, &rdev->flags) ||
		    atomic_read(&rdev->nr_pending)) {
			err = -EBUSY;
			goto abort;
		}
		p->rdev = NULL;
		/* Let concurrent RCU readers of ->rdev drain before the
		 * final nr_pending check below. */
		synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
			err = -EBUSY;
			p->rdev = rdev;
		}
	}
abort:
	print_conf(conf);
	return err;
}

/*
 * Completion callback for the read half of a resync/recovery request.
 * Finds the slot this bio was issued on, records success or flags the
 * device, and reschedules the r10_bio: immediately for recovery, or
 * once all reads have completed for resync.
 */
static void end_sync_read(struct bio *bio, int error)
{
	r10bio_t *r10_bio = (r10bio_t *)(bio->bi_private);
	conf_t *conf = mddev_to_conf(r10_bio->mddev);
	int i, d;

	/* Locate which copy/slot this bio belongs to. */
	for (i = 0; i < conf->copies; i++)
		if (r10_bio->devs[i].bio == bio)
			break;
	BUG_ON(i == conf->copies);
	update_head_pos(i, r10_bio);
	d = r10_bio->devs[i].devnum;

	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	else {
		/* Read failed: count the sectors against this device's
		 * corrected_errors (the sync is expected to rewrite them
		 * from a good copy -- NOTE(review): confirm intent). */
		atomic_add(r10_bio->sectors,
			   &conf->mirrors[d].rdev->corrected_errors);
		if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
			md_error(r10_bio->mddev,
				 conf->mirrors[d].rdev);
	}

	/* for reconstruct, we always reschedule after a read.
	 * for resync, only after all reads
	 */
	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
	    atomic_dec_and_test(&r10_bio->remaining)) {
		/* we have read all the blocks,
		 * do the comparison in process context in raid10d
		 */
		reschedule_retry(r10_bio);
	}
	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
}

/*
 * Completion callback for the write half of a resync/recovery request.
 * When the last outstanding write finishes, walks the chain of r10_bios
 * linked through master_bio and completes the whole operation.
 */
static void end_sync_write(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t *r10_bio = (r10bio_t *)(bio->bi_private);
	mddev_t *mddev = r10_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);
	int i, d;

	for (i = 0; i < conf->copies; i++)
		if (r10_bio->devs[i].bio == bio)
			break;
	d = r10_bio->devs[i].devnum;

	if (!uptodate)
		md_error(mddev, conf->mirrors[d].rdev);

	update_head_pos(i, r10_bio);

	/* Drop our reference; if we were the last, release each r10_bio
	 * up the master_bio chain until the primary is reached. */
	while (atomic_dec_and_test(&r10_bio->remaining)) {
		if (r10_bio->master_bio == NULL) {
			/* the primary of several recovery bios */
			md_done_sync(mddev, r10_bio->sectors, 1);
			put_buf(r10_bio);
			break;
		} else {
			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
			put_buf(r10_bio);
			r10_bio = r10_bio2;
		}
	}
	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
}

/*
 * Note: sync and recover are handled very differently for raid10.
 * This code is for resync.
 * For resync, we read through virtual addresses and read all blocks.
 * If there is any error, we schedule a write.  The lowest numbered
 * drive is authoritative.
 * However requests come for physical address, so we need to map.
 * For every physical address there are raid_disks/copies virtual addresses,
 * which is always at least one, but is not necessarily an integer.
 * This means that a physical address can span multiple chunks, so we may
 * have to submit multiple io requests for a single sync request.
 */

/*
 * We check if all blocks are in-sync and only write to blocks that
 * aren't in sync
 */
static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i, first;
	struct bio *tbio, *fbio;

	/* Bias 'remaining' so the dec_and_test at 'done' only fires after
	 * every rewrite has been submitted (or none were needed). */
	atomic_set(&r10_bio->remaining, 1);

	/* find the first device with a block */
	for (i = 0; i < conf->copies; i++)
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
			break;
	if (i == conf->copies)
		goto done;
	first = i;
	fbio = r10_bio->devs[i].bio;

	/* now find blocks with errors */
	for (i = 0; i < conf->copies; i++) {
		int j, d;
		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);

		tbio = r10_bio->devs[i].bio;

		if (tbio->bi_end_io != end_sync_read)
			continue;
		if (i == first)
			continue;
		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
			/* We know that the bi_io_vec layout is the same for
			 * both 'first' and 'i', so we just compare them.
			 * All vec entries are PAGE_SIZE;
			 */
			for (j = 0; j < vcnt; j++)
				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
					   page_address(tbio->bi_io_vec[j].bv_page),
					   PAGE_SIZE))
					break;
			if (j == vcnt)
				continue;
			mddev->resync_mismatches += r10_bio->sectors;
		}
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
			/* Don't fix anything.
			 */
			continue;
		/* Ok, we need to write this bio
		 * First we need to fixup bv_offset, bv_len and
		 * bi_vecs, as the read request might have corrupted these
		 */
		tbio->bi_vcnt = vcnt;
		tbio->bi_size = r10_bio->sectors << 9;
		tbio->bi_idx = 0;
		tbio->bi_phys_segments = 0;
		tbio->bi_hw_segments = 0;
		tbio->bi_hw_front_size = 0;
		tbio->bi_hw_back_size = 0;
		/* Clear everything but the pool bits, then mark uptodate. */
		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
		tbio->bi_flags |= 1 << BIO_UPTODATE;
		tbio->bi_next = NULL;
		tbio->bi_rw = WRITE;
		tbio->bi_private = r10_bio;
		tbio->bi_sector = r10_bio->devs[i].addr;

		/* Copy the authoritative data from 'first' into this bio. */
		for (j = 0; j < vcnt; j++) {
			tbio->bi_io_vec[j].bv_offset = 0;
			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;

			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
			       page_address(fbio->bi_io_vec[j].bv_page),
			       PAGE_SIZE);
		}
		tbio->bi_end_io = end_sync_write;

		d = r10_bio->devs[i].devnum;
		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
		atomic_inc(&r10_bio->remaining);
		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);

		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		generic_make_request(tbio);
	}

done:
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(mddev, r10_bio->sectors, 1);
		put_buf(r10_bio);
	}
}

/*
 * Now for the recovery code.
 * Recovery happens across physical sectors.
 * We recover all non-is_sync drives by finding the virtual address of
 * each, and then choose a working drive that also has that virt address.
 * There is a separate r10_bio for each non-in_sync drive.
 * Only the first two slots are in use. The first for reading,
 * The second for writing.
 *
 */
/*
 * Write half of recovery: hand the pages read into devs[0].bio over to
 * devs[1].bio (by swapping them across) and write them to the device
 * being rebuilt.
 */
static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i, d;
	struct bio *bio, *wbio;

	/* move the pages across to the second bio
	 * and submit the write request
	 */
	bio = r10_bio->devs[0].bio;
	wbio = r10_bio->devs[1].bio;
	for (i = 0; i < wbio->bi_vcnt; i++) {
		struct page *p = bio->bi_io_vec[i].bv_page;
		bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
		wbio->bi_io_vec[i].bv_page = p;
	}
	d = r10_bio->devs[1].devnum;

	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
	/* Only issue the write if the read side actually succeeded. */
	if (test_bit(R10BIO_Uptodate, &r10_bio->state))
		generic_make_request(wbio);
	else
		bio_endio(wbio, -EIO);
}

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems encounter.
 *	3.	Performs writes following reads for array synchronising.
 */
/*
 * Try to repair a read error in place: read the failing range from some
 * other in-sync copy, write the good data back to each other copy, then
 * re-read from each to verify.  Runs synchronously; the caller (raid10d)
 * freezes the array around this call.
 */
static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
{
	int sect = 0; /* Offset from r10_bio->sector */
	int sectors = r10_bio->sectors;
	mdk_rdev_t *rdev;

	/* Handle at most one page worth of sectors per iteration. */
	while (sectors) {
		int s = sectors;
		int sl = r10_bio->read_slot;
		int success = 0;
		int start;

		if (s > (PAGE_SIZE>>9))
			s = PAGE_SIZE >> 9;

		/* Find an in-sync copy we can read good data from, starting
		 * at the slot the failed read used. */
		rcu_read_lock();
		do {
			int d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev &&
			    test_bit(In_sync, &rdev->flags)) {
				/* Pin the rdev so we can drop the RCU lock
				 * around the synchronous I/O. */
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				success = sync_page_io(rdev->bdev,
						       r10_bio->devs[sl].addr +
						       sect + rdev->data_offset,
						       s<<9,
						       conf->tmppage, READ);
				rdev_dec_pending(rdev, mddev);
				rcu_read_lock();
				if (success)
					break;
			}
			sl++;
			if (sl == conf->copies)
				sl = 0;
		} while (!success && sl != r10_bio->read_slot);
		rcu_read_unlock();

		if (!success) {
			/* Cannot read from anywhere -- bye bye array */
			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
			md_error(mddev, conf->mirrors[dn].rdev);
			break;
		}

		start = sl;
		/* write it back and re-read */
		rcu_read_lock();
		while (sl != r10_bio->read_slot) {
			int d;

			if (sl == 0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev &&
			    test_bit(In_sync, &rdev->flags)) {
				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				atomic_add(s, &rdev->corrected_errors);
				if (sync_page_io(rdev->bdev,
						 r10_bio->devs[sl].addr +
						 sect + rdev->data_offset,
						 s<<9, conf->tmppage, WRITE)
				    == 0)
					/* Well, this device is dead */
					md_error(mddev, rdev);
				rdev_dec_pending(rdev, mddev);
				rcu_read_lock();
			}
		}

		/* Second pass: re-read each copy to verify the rewrite. */
		sl = start;
		while (sl != r10_bio->read_slot) {
			int d;

			if (sl == 0)
				sl = conf->copies;
			sl--;
			d = r10_bio->devs[sl].devnum;
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev &&
			    test_bit(In_sync, &rdev->flags)) {
				char b[BDEVNAME_SIZE];

				atomic_inc(&rdev->nr_pending);
				rcu_read_unlock();
				if (sync_page_io(rdev->bdev,
						 r10_bio->devs[sl].addr +
						 sect + rdev->data_offset,
						 s<<9, conf->tmppage, READ) == 0)
					/* Well, this device is dead */
					md_error(mddev, rdev);
				else
					printk(KERN_INFO
					       "raid10:%s: read error corrected"
					       " (%d sectors at %llu on %s)\n",
					       mdname(mddev), s,
					       (unsigned long long)(sect +
							rdev->data_offset),
					       bdevname(rdev->bdev, b));

				rdev_dec_pending(rdev, mddev);
				rcu_read_lock();
			}
		}
		rcu_read_unlock();

		sectors -= s;
		sect += s;
	}
}

/*
 * Per-array raid10 daemon: flushes queued writes (after the bitmap) and
 * services retried, resync and recovery r10_bios from conf->retry_list.
 */
static void raid10d(mddev_t *mddev)
{
	r10bio_t *r10_bio;
	struct bio *bio;
	unsigned long flags;
	conf_t *conf = mddev_to_conf(mddev);
	struct list_head *head = &conf->retry_list;
	int unplug = 0;
	mdk_rdev_t *rdev;

	md_check_recovery(mddev);

	for (;;) {
		char b[BDEVNAME_SIZE];

		spin_lock_irqsave(&conf->device_lock, flags);

		if (conf->pending_bio_list.head) {
			bio = bio_list_get(&conf->pending_bio_list);
			blk_remove_plug(mddev->queue);
			spin_unlock_irqrestore(&conf->device_lock, flags);
			/* flush any pending bitmap writes to disk
			 * before proceeding w/ I/O */
			bitmap_unplug(mddev->bitmap);

			while (bio) { /* submit pending writes */
				struct bio *next = bio->bi_next;
				bio->bi_next = NULL;
				generic_make_request(bio);
				bio = next;
			}
			unplug = 1;

			continue;
		}

		if (list_empty(head))
			break;
		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
		list_del(head->prev);
		conf->nr_queued--;
		spin_unlock_irqrestore(&conf->device_lock, flags);

		/* Rebind mddev/conf: queued r10_bios may belong to a
		 * different array than the one we were called for. */
		mddev = r10_bio->mddev;
		conf = mddev_to_conf(mddev);
		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
			sync_request_write(mddev, r10_bio);
			unplug = 1;
		} else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
			recovery_request_write(mddev, r10_bio);
			unplug = 1;
		} else {
			int mirror;
			/* we got a read error. Maybe the drive is bad.  Maybe just
			 * the block and we can fix it.
			 * We freeze all other IO, and try reading the block from
			 * other devices.  When we find one, we re-write
			 * and check it that fixes the read error.
			 * This is all done synchronously while the array is
			 * frozen.
			 */
			if (mddev->ro == 0) {
				freeze_array(conf);
				fix_read_error(conf, mddev, r10_bio);
				unfreeze_array(conf);
			}

			/* Block this slot on a read-only array; otherwise
			 * free it so read_balance can pick a new mirror. */
			bio = r10_bio->devs[r10_bio->read_slot].bio;
			r10_bio->devs[r10_bio->read_slot].bio =
				mddev->ro ? IO_BLOCKED : NULL;
			mirror = read_balance(conf, r10_bio);
			if (mirror == -1) {
				printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
				       " read error for block %llu\n",
				       bdevname(bio->bi_bdev,b),
				       (unsigned long long)r10_bio->sector);
				raid_end_bio_io(r10_bio);
				bio_put(bio);
			} else {
				const int do_sync = bio_sync(r10_bio->master_bio);

				bio_put(bio);
				rdev = conf->mirrors[mirror].rdev;
				if (printk_ratelimit())
					printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
					       " another mirror\n",
					       bdevname(rdev->bdev,b),
					       (unsigned long long)r10_bio->sector);
				/* Re-issue the read against the chosen mirror. */
				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
				r10_bio->devs[r10_bio->read_slot].bio = bio;
				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
					+ rdev->data_offset;
				bio->bi_bdev = rdev->bdev;
				bio->bi_rw = READ | do_sync;
				bio->bi_private = r10_bio;
				bio->bi_end_io = raid10_end_read_request;
				unplug = 1;
				generic_make_request(bio);
			}
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
	if (unplug)
		unplug_slaves(mddev);
}

/*
 * Allocate the mempool of resync buffers used while syncing.
 * Returns 0 on success, -ENOMEM if the pool cannot be created.
 */
static int init_resync(conf_t *conf)
{
	int buffs;

	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
	BUG_ON(conf->r10buf_pool);
	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc,
					   r10buf_pool_free, conf);
	if (!conf->r10buf_pool)
		return -ENOMEM;
	conf->next_resync = 0;
	return 0;
}

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
* For recovery, we iterate over physical addresses, read a good * value for each non-in_sync drive, and over-write. * * So, for recovery we may have several outstanding complex requests for a * given address, one for each out-of-sync device.  We model this by allocating * a number of r10_bio structures, one for each out-of-sync device. * As we setup these structures, we collect all bio's together into a list * which we then process collectively to add pages, and then process again * to pass to generic_make_request. * * The r10_bio structures are linked using a borrowed master_bio pointer. * This link is counted in ->remaining.  When the r10_bio that points to NULL * has its remaining count decremented to 0, the whole complex operation * is complete. * */static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster){	conf_t *conf = mddev_to_conf(mddev);	r10bio_t *r10_bio;	struct bio *biolist = NULL, *bio;	sector_t max_sector, nr_sectors;	int disk;	int i;	int max_sync;	int sync_blocks;	sector_t sectors_skipped = 0;	int chunks_skipped = 0;	if (!conf->r10buf_pool)		if (init_resync(conf))			return 0; skipped:	max_sector = mddev->size << 1;	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))		max_sector = mddev->resync_max_sectors;	if (sector_nr >= max_sector) {		/* If we aborted, we need to abort the		 * sync on the 'current' bitmap chucks (there can		 * be several when recovering multiple devices).		 * as we may have started syncing it but not finished.		 * We can find the current address in		 * mddev->curr_resync, but for recovery,		 * we need to convert that to several		 * virtual addresses.		 */		if (mddev->curr_resync < max_sector) { /* aborted */			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,						&sync_blocks, 1);			else for (i=0; i<conf->raid_disks; i++) {				sector_t sect =					raid10_find_virt(conf, mddev->curr_resync, i);				bitmap_end_sync(mddev->bitmap, sect,

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?