/*
 * raid1.c -- excerpt from the Linux kernel software-RAID1 driver
 * (drivers/md/raid1.c): resync completion handlers, read-error
 * repair, the raid1d management thread, and resync-pool setup.
 * (Page 1 of 4 of a ~2,200-line file, recovered from a code-viewer
 * capture; the extraction collapsed the original line breaks.)
 */
/*
 * Endio handler for the READ side of a resync/recovery request.
 * Identifies which mirror slot the completed bio belongs to, records
 * the disk head position, and leaves any follow-up work (re-write, or
 * re-read on failure) to the raid1d thread.
 */
static void end_sync_read(struct bio *bio, int error)
{
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int i;

	/* Find which bios[] slot this bio occupies. */
	for (i=r1_bio->mddev->raid_disks; i--; )
		if (r1_bio->bios[i] == bio)
			break;
	BUG_ON(i < 0);	/* the bio must belong to this r1_bio */
	update_head_pos(i, r1_bio);
	/*
	 * we have read a block, now it needs to be re-written,
	 * or re-read if the read failed.
	 * We don't do much here, just schedule handling by raid1d
	 */
	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		set_bit(R1BIO_Uptodate, &r1_bio->state);

	if (atomic_dec_and_test(&r1_bio->remaining))
		reschedule_retry(r1_bio);
}

/*
 * Endio handler for the WRITE side of a resync/recovery request.
 * On write failure the affected bitmap region is marked so its bits
 * are not cleared, and the target device is failed.  When the last
 * outstanding bio of the r1_bio completes, the sync chunk is reported
 * done and the buffer released.
 */
static void end_sync_write(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	mddev_t *mddev = r1_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);
	int i;
	int mirror=0;

	/* Find which mirror this write went to. */
	for (i = 0; i < conf->raid_disks; i++)
		if (r1_bio->bios[i] == bio) {
			mirror = i;
			break;
		}
	if (!uptodate) {
		int sync_blocks = 0;
		sector_t s = r1_bio->sector;
		long sectors_to_go = r1_bio->sectors;
		/* make sure these bits don't get cleared. */
		do {
			bitmap_end_sync(mddev->bitmap, s,
					&sync_blocks, 1);
			s += sync_blocks;
			sectors_to_go -= sync_blocks;
		} while (sectors_to_go > 0);
		md_error(mddev, conf->mirrors[mirror].rdev);
	}

	update_head_pos(mirror, r1_bio);

	if (atomic_dec_and_test(&r1_bio->remaining)) {
		md_done_sync(mddev, r1_bio->sectors, uptodate);
		put_buf(r1_bio);
	}
}

/*
 * Second phase of a resync/check request, run from raid1d after the
 * reads issued by sync_request() have completed:
 *  - for a requested check/repair (MD_RECOVERY_REQUESTED), compare the
 *    copies page-by-page against a chosen "primary" and only rewrite
 *    mirrors that differ or failed to read;
 *  - if no copy could be read at all, attempt page-sized synchronous
 *    reads from the other mirrors to recover the data;
 *  - finally submit the WRITE bios to bring the mirrors into sync.
 */
static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;
	int disks = conf->raid_disks;
	struct bio *bio, *wbio;

	bio = r1_bio->bios[r1_bio->read_disk];

	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		/* We have read all readable devices.  If we haven't
		 * got the block, then there is no hope left.
		 * If we have, then we want to do a comparison
		 * and skip the write if everything is the same.
		 * If any blocks failed to read, then we need to
		 * attempt an over-write
		 */
		int primary;
		if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
			/* Nothing readable at all: fail every mirror we
			 * tried to read from and give up on this chunk. */
			for (i=0; i<mddev->raid_disks; i++)
				if (r1_bio->bios[i]->bi_end_io == end_sync_read)
					md_error(mddev, conf->mirrors[i].rdev);
			md_done_sync(mddev, r1_bio->sectors, 1);
			put_buf(r1_bio);
			return;
		}
		/* Pick the first successfully-read mirror as the
		 * reference copy ("primary"). */
		for (primary=0; primary<mddev->raid_disks; primary++)
			if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
			    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
				r1_bio->bios[primary]->bi_end_io = NULL;
				rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
				break;
			}
		r1_bio->read_disk = primary;
		for (i=0; i<mddev->raid_disks; i++)
			if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
				int j;
				int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
				struct bio *pbio = r1_bio->bios[primary];
				struct bio *sbio = r1_bio->bios[i];

				if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
					/* Compare this copy page-by-page with
					 * the primary; j < 0 means identical. */
					for (j = vcnt; j-- ; ) {
						struct page *p, *s;
						p = pbio->bi_io_vec[j].bv_page;
						s = sbio->bi_io_vec[j].bv_page;
						if (memcmp(page_address(p),
							   page_address(s),
							   PAGE_SIZE))
							break;
					}
				} else
					j = 0;	/* read failed: treat as mismatch */
				if (j >= 0)
					mddev->resync_mismatches += r1_bio->sectors;
				if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
					      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
					/* Identical, or check-only mode:
					 * no rewrite needed for this mirror. */
					sbio->bi_end_io = NULL;
					rdev_dec_pending(conf->mirrors[i].rdev, mddev);
				} else {
					/* fixup the bio for reuse */
					sbio->bi_vcnt = vcnt;
					sbio->bi_size = r1_bio->sectors << 9;
					sbio->bi_idx = 0;
					sbio->bi_phys_segments = 0;
					sbio->bi_hw_segments = 0;
					sbio->bi_hw_front_size = 0;
					sbio->bi_hw_back_size = 0;
					sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
					sbio->bi_flags |= 1 << BIO_UPTODATE;
					sbio->bi_next = NULL;
					sbio->bi_sector = r1_bio->sector +
						conf->mirrors[i].rdev->data_offset;
					sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
					/* Copy the primary's data into this bio's
					 * pages so the write brings it in sync. */
					for (j = 0; j < vcnt ; j++)
						memcpy(page_address(sbio->bi_io_vec[j].bv_page),
						       page_address(pbio->bi_io_vec[j].bv_page),
						       PAGE_SIZE);
				}
			}
	}
	if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
		/* ouch - failed to read all of that.
		 * Try some synchronous reads of other devices to get
		 * good data, much like with normal read errors.  Only
		 * read into the pages we already have so we don't
		 * need to re-issue the read request.
		 * We don't need to freeze the array, because being in an
		 * active sync request, there is no normal IO, and
		 * no overlapping syncs.
		 */
		sector_t sect = r1_bio->sector;
		int sectors = r1_bio->sectors;
		int idx = 0;

		while(sectors) {
			int s = sectors;
			int d = r1_bio->read_disk;
			int success = 0;
			mdk_rdev_t *rdev;

			if (s > (PAGE_SIZE>>9))
				s = PAGE_SIZE >> 9;	/* at most one page per pass */
			do {
				if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
					/* No rcu protection needed here devices
					 * can only be removed when no resync is
					 * active, and resync is currently active
					 */
					rdev = conf->mirrors[d].rdev;
					if (sync_page_io(rdev->bdev,
							 sect + rdev->data_offset,
							 s<<9,
							 bio->bi_io_vec[idx].bv_page,
							 READ)) {
						success = 1;
						break;
					}
				}
				d++;
				if (d == conf->raid_disks)
					d = 0;
			} while (!success && d != r1_bio->read_disk);

			if (success) {
				int start = d;
				/* write it back and re-read */
				set_bit(R1BIO_Uptodate, &r1_bio->state);
				while (d != r1_bio->read_disk) {
					/* Walk backwards from the good disk
					 * around to read_disk, rewriting. */
					if (d == 0)
						d = conf->raid_disks;
					d--;
					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
						continue;
					rdev = conf->mirrors[d].rdev;
					atomic_add(s, &rdev->corrected_errors);
					if (sync_page_io(rdev->bdev,
							 sect + rdev->data_offset,
							 s<<9,
							 bio->bi_io_vec[idx].bv_page,
							 WRITE) == 0)
						md_error(mddev, rdev);
				}
				d = start;
				while (d != r1_bio->read_disk) {
					/* Same walk again, verifying with a
					 * re-read. */
					if (d == 0)
						d = conf->raid_disks;
					d--;
					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
						continue;
					rdev = conf->mirrors[d].rdev;
					if (sync_page_io(rdev->bdev,
							 sect + rdev->data_offset,
							 s<<9,
							 bio->bi_io_vec[idx].bv_page,
							 READ) == 0)
						md_error(mddev, rdev);
				}
			} else {
				char b[BDEVNAME_SIZE];
				/* Cannot read from anywhere, array is toast */
				md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
				printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
				       " for block %llu\n",
				       bdevname(bio->bi_bdev,b),
				       (unsigned long long)r1_bio->sector);
				md_done_sync(mddev, r1_bio->sectors, 0);
				put_buf(r1_bio);
				return;
			}
			sectors -= s;
			sect += s;
			idx ++;
		}
	}

	/*
	 * schedule writes
	 */
	atomic_set(&r1_bio->remaining, 1);
	for (i = 0; i < disks ; i++) {
		wbio = r1_bio->bios[i];
		if (wbio->bi_end_io == NULL ||
		    (wbio->bi_end_io == end_sync_read &&
		     (i == r1_bio->read_disk ||
		      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
			continue;

		wbio->bi_rw = WRITE;
		wbio->bi_end_io = end_sync_write;
		atomic_inc(&r1_bio->remaining);
		md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);

		generic_make_request(wbio);
	}

	if (atomic_dec_and_test(&r1_bio->remaining)) {
		/* if we're here, all write(s) have completed, so clean up */
		md_done_sync(mddev, r1_bio->sectors, 1);
		put_buf(r1_bio);
	}
}

/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 *
 * (NOTE(review): this comment describes raid1d() below; it was kept in
 * its original position, ahead of fix_read_error().)
 */

/*
 * Try to repair a read error at [sect, sect+sectors) on read_disk:
 * find any In_sync mirror the data can be read from (into
 * conf->tmppage), then write it back to, and re-read it from, all the
 * other In_sync mirrors, failing any device that errors.  If no mirror
 * is readable the disk that reported the error is failed.
 * Runs synchronously in the raid1d thread.
 */
static void fix_read_error(conf_t *conf, int read_disk,
			   sector_t sect, int sectors)
{
	mddev_t *mddev = conf->mddev;
	while(sectors) {
		int s = sectors;
		int d = read_disk;
		int success = 0;
		int start;
		mdk_rdev_t *rdev;

		if (s > (PAGE_SIZE>>9))
			s = PAGE_SIZE >> 9;	/* repair one page at a time */

		do {
			/* Note: no rcu protection needed here
			 * as this is synchronous in the raid1d thread
			 * which is the thread that might remove
			 * a device.  If raid1d ever becomes multi-threaded....
			 */
			rdev = conf->mirrors[d].rdev;
			if (rdev &&
			    test_bit(In_sync, &rdev->flags) &&
			    sync_page_io(rdev->bdev,
					 sect + rdev->data_offset,
					 s<<9,
					 conf->tmppage, READ))
				success = 1;
			else {
				d++;
				if (d == conf->raid_disks)
					d = 0;
			}
		} while (!success && d != read_disk);

		if (!success) {
			/* Cannot read from anywhere -- bye bye array */
			md_error(mddev, conf->mirrors[read_disk].rdev);
			break;
		}
		/* write it back and re-read */
		start = d;
		while (d != read_disk) {
			if (d==0)
				d = conf->raid_disks;
			d--;
			rdev = conf->mirrors[d].rdev;
			if (rdev &&
			    test_bit(In_sync, &rdev->flags)) {
				if (sync_page_io(rdev->bdev,
						 sect + rdev->data_offset,
						 s<<9, conf->tmppage, WRITE)
				    == 0)
					/* Well, this device is dead */
					md_error(mddev, rdev);
			}
		}
		d = start;
		while (d != read_disk) {
			char b[BDEVNAME_SIZE];
			if (d==0)
				d = conf->raid_disks;
			d--;
			rdev = conf->mirrors[d].rdev;
			if (rdev &&
			    test_bit(In_sync, &rdev->flags)) {
				if (sync_page_io(rdev->bdev,
						 sect + rdev->data_offset,
						 s<<9, conf->tmppage, READ)
				    == 0)
					/* Well, this device is dead */
					md_error(mddev, rdev);
				else {
					atomic_add(s, &rdev->corrected_errors);
					printk(KERN_INFO
					       "raid1:%s: read error corrected "
					       "(%d sectors at %llu on %s)\n",
					       mdname(mddev), s,
					       (unsigned long long)(sect +
					           rdev->data_offset),
					       bdevname(rdev->bdev, b));
				}
			}
		}
		sectors -= s;
		sect += s;
	}
}

/*
 * Main loop of the raid1 management thread (see the numbered comment
 * above): flushes queued writes, completes resync requests via
 * sync_request_write(), resubmits failed barrier writes without the
 * barrier flag, and redirects failed reads to another mirror after
 * attempting in-place repair with fix_read_error().
 */
static void raid1d(mddev_t *mddev)
{
	r1bio_t *r1_bio;
	struct bio *bio;
	unsigned long flags;
	conf_t *conf = mddev_to_conf(mddev);
	struct list_head *head = &conf->retry_list;
	int unplug=0;
	mdk_rdev_t *rdev;

	md_check_recovery(mddev);

	for (;;) {
		char b[BDEVNAME_SIZE];
		spin_lock_irqsave(&conf->device_lock, flags);

		if (conf->pending_bio_list.head) {
			bio = bio_list_get(&conf->pending_bio_list);
			blk_remove_plug(mddev->queue);
			spin_unlock_irqrestore(&conf->device_lock, flags);
			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
			bitmap_unplug(mddev->bitmap);

			while (bio) { /* submit pending writes */
				struct bio *next = bio->bi_next;
				bio->bi_next = NULL;
				generic_make_request(bio);
				bio = next;
			}
			unplug = 1;
			continue;
		}

		if (list_empty(head))
			break;
		r1_bio = list_entry(head->prev, r1bio_t, retry_list);
		list_del(head->prev);
		conf->nr_queued--;
		spin_unlock_irqrestore(&conf->device_lock, flags);

		mddev = r1_bio->mddev;
		conf = mddev_to_conf(mddev);
		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
			sync_request_write(mddev, r1_bio);
			unplug = 1;
		} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
			/* some requests in the r1bio were BIO_RW_BARRIER
			 * requests which failed with -EOPNOTSUPP.  Hohumm..
			 * Better resubmit without the barrier.
			 * We know which devices to resubmit for, because
			 * all others have had their bios[] entry cleared.
			 * We already have a nr_pending reference on these rdevs.
			 */
			int i;
			const int do_sync = bio_sync(r1_bio->master_bio);
			clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
			clear_bit(R1BIO_Barrier, &r1_bio->state);
			for (i=0; i < conf->raid_disks; i++)
				if (r1_bio->bios[i])
					atomic_inc(&r1_bio->remaining);
			for (i=0; i < conf->raid_disks; i++)
				if (r1_bio->bios[i]) {
					struct bio_vec *bvec;
					int j;

					bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
					/* copy pages from the failed bio, as
					 * this might be a write-behind device */
					__bio_for_each_segment(bvec, bio, j, 0)
						bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
					bio_put(r1_bio->bios[i]);
					bio->bi_sector = r1_bio->sector +
						conf->mirrors[i].rdev->data_offset;
					bio->bi_bdev = conf->mirrors[i].rdev->bdev;
					bio->bi_end_io = raid1_end_write_request;
					bio->bi_rw = WRITE | do_sync;
					bio->bi_private = r1_bio;
					r1_bio->bios[i] = bio;
					generic_make_request(bio);
				}
		} else {
			int disk;

			/* we got a read error. Maybe the drive is bad.  Maybe just
			 * the block and we can fix it.
			 * We freeze all other IO, and try reading the block from
			 * other devices.  When we find one, we re-write
			 * and check it that fixes the read error.
			 * This is all done synchronously while the array is
			 * frozen
			 */
			if (mddev->ro == 0) {
				freeze_array(conf);
				fix_read_error(conf, r1_bio->read_disk,
					       r1_bio->sector,
					       r1_bio->sectors);
				unfreeze_array(conf);
			}

			bio = r1_bio->bios[r1_bio->read_disk];
			if ((disk=read_balance(conf, r1_bio)) == -1) {
				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
				       " read error for block %llu\n",
				       bdevname(bio->bi_bdev,b),
				       (unsigned long long)r1_bio->sector);
				raid_end_bio_io(r1_bio);
			} else {
				const int do_sync = bio_sync(r1_bio->master_bio);
				r1_bio->bios[r1_bio->read_disk] =
					mddev->ro ? IO_BLOCKED : NULL;
				r1_bio->read_disk = disk;
				bio_put(bio);
				bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
				r1_bio->bios[r1_bio->read_disk] = bio;
				rdev = conf->mirrors[disk].rdev;
				if (printk_ratelimit())
					printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
					       " another mirror\n",
					       bdevname(rdev->bdev,b),
					       (unsigned long long)r1_bio->sector);
				bio->bi_sector = r1_bio->sector + rdev->data_offset;
				bio->bi_bdev = rdev->bdev;
				bio->bi_end_io = raid1_end_read_request;
				bio->bi_rw = READ | do_sync;
				bio->bi_private = r1_bio;
				unplug = 1;
				generic_make_request(bio);
			}
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
	if (unplug)
		unplug_slaves(mddev);
}

/*
 * Allocate the mempool of r1bio buffers used by resync.
 * Returns 0 on success, -ENOMEM if the pool cannot be created.
 */
static int init_resync(conf_t *conf)
{
	int buffs;

	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
	BUG_ON(conf->r1buf_pool);	/* must not already be allocated */
	conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
					  conf->poolinfo);
	if (!conf->r1buf_pool)
		return -ENOMEM;
	conf->next_resync = 0;
	return 0;
}

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * (NOTE(review): this function is truncated in this excerpt; the body
 * below is incomplete and continues on the next page of the capture.)
 */
static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
{
	conf_t *conf = mddev_to_conf(mddev);
	r1bio_t *r1_bio;
	struct bio *bio;
	sector_t max_sector, nr_sectors;
	int disk = -1;
	int i;
	int wonly = -1;
	int write_targets = 0, read_targets = 0;
	int sync_blocks;
	int still_degraded = 0;

	if (!conf->r1buf_pool)
	{
/*		printk("sync start - bitmap %p\n", mddev->bitmap);*/
		if (init_resync(conf))
			return 0;
	}

	max_sector = mddev->size << 1;
	if (sector_nr >= max_sector) {
		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunk (there will
		 * only be one in raid1 resync.
		 * We can find the current address in mddev->curr_resync

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?