raid10.c

来自「linux 内核源代码」· C语言 代码 · 共 2,188 行 · 第 1/4 页

C
2,188
字号
		    !test_bit(In_sync, &rdev->flags))
			continue;

		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
			disk = ndisk;
			slot = nslot;
			break;
		}
		new_distance = abs(r10_bio->devs[nslot].addr -
				   conf->mirrors[ndisk].head_position);
		if (new_distance < current_distance) {
			current_distance = new_distance;
			disk = ndisk;
			slot = nslot;
		}
	}

rb_out:
	r10_bio->read_slot = slot;

/*	conf->next_seq_sect = this_sector + sectors;*/

	if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
	else
		disk = -1;
	rcu_read_unlock();

	return disk;
}

/* Unplug the request queue of every working member device so that any
 * plugged IO is pushed down to the hardware.  nr_pending is raised around
 * the blk_unplug() call so the rdev cannot be torn down while the RCU
 * read lock is temporarily dropped.
 */
static void unplug_slaves(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;

	rcu_read_lock();
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);

			/* pin the rdev before dropping the RCU read lock */
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();

			blk_unplug(r_queue);

			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
}

/* Queue unplug callback: flush member queues and wake the raid10d thread. */
static void raid10_unplug(struct request_queue *q)
{
	mddev_t *mddev = q->queuedata;

	unplug_slaves(q->queuedata);
	md_wakeup_thread(mddev->thread);
}

/* Congestion callback: the array is congested if any non-faulty member's
 * backing device reports congestion for the requested bits.
 */
static int raid10_congested(void *data, int bits)
{
	mddev_t *mddev = data;
	conf_t *conf = mddev_to_conf(mddev);
	int i, ret = 0;

	rcu_read_lock();
	for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags)) {
			struct request_queue *q = bdev_get_queue(rdev->bdev);

			ret |= bdi_congested(&q->backing_dev_info, bits);
		}
	}
	rcu_read_unlock();
	return ret;
}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
 */
#define RESYNC_DEPTH 32

/* Raise the barrier: block new regular IO from starting, then wait for
 * all in-flight IO to drain.  With 'force' set, do not defer to queued
 * regular IO (only legal when the caller already holds a barrier).
 */
static void raise_barrier(conf_t *conf, int force)
{
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock,
			    raid10_unplug(conf->mddev->queue));

	/* block any new IO from starting */
	conf->barrier++;

	/* Now wait for all pending IO to complete */
	wait_event_lock_irq(conf->wait_barrier,
			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock,
			    raid10_unplug(conf->mddev->queue));

	spin_unlock_irq(&conf->resync_lock);
}

/* Drop one level of barrier and wake anyone waiting for it to clear. */
static void lower_barrier(conf_t *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->barrier--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

/* Regular-IO entry point: wait until no barrier is raised, then count
 * this request in nr_pending.  Paired with allow_barrier().
 */
static void wait_barrier(conf_t *conf)
{
	spin_lock_irq(&conf->resync_lock);
	if (conf->barrier) {
		conf->nr_waiting++;
		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
    conf->resync_lock,
				    raid10_unplug(conf->mddev->queue));
		conf->nr_waiting--;
	}
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);
}

/* Regular-IO exit point: drop our nr_pending count and wake any barrier
 * raiser that is waiting for pending IO to drain.
 */
static void allow_barrier(conf_t *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->nr_pending--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

static void freeze_array(conf_t *conf)
{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until barrier+nr_pending match nr_queued+2
	 */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier++;
	conf->nr_waiting++;
	wait_event_lock_irq(conf->wait_barrier,
			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
			    conf->resync_lock,
			    raid10_unplug(conf->mddev->queue));
	spin_unlock_irq(&conf->resync_lock);
}

static void unfreeze_array(conf_t *conf)
{
	/* reverse the effect of the freeze */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier--;
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	spin_unlock_irq(&conf->resync_lock);
}

/* Entry point for regular READ/WRITE bios submitted to the array.
 * Returns 0 once the bio has been fully taken care of (forwarded to
 * member devices, queued, or completed with an error).
 */
static int make_request(struct request_queue *q, struct bio * bio)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	mirror_info_t *mirror;
	r10bio_t *r10_bio;
	struct bio *read_bio;
	int i;
	int chunk_sects = conf->chunk_mask + 1;
	const int rw = bio_data_dir(bio);
	const int do_sync = bio_sync(bio);
	struct bio_list bl;
	unsigned long flags;

	/* barrier bios are not supported by this driver */
	if (unlikely(bio_barrier(bio))) {
		bio_endio(bio, -EOPNOTSUPP);
		return 0;
	}

	/* If this request crosses a chunk boundary, we need to
	 * split it.  This will only happen for 1 PAGE (or less) requests.
*/
	if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
		      > chunk_sects &&
		    conf->near_copies < conf->raid_disks)) {
		struct bio_pair *bp;
		/* Sanity check -- queue functions should prevent this happening */
		if (bio->bi_vcnt != 1 ||
		    bio->bi_idx != 0)
			goto bad_map;
		/* This is a one page bio that upper layers
		 * refuse to split for us, so we need to split it.
		 */
		bp = bio_split(bio, bio_split_pool,
			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
		/* re-submit each half through this function; a non-zero return
		 * means it was not consumed here, so push it down the stack
		 */
		if (make_request(q, &bp->bio1))
			generic_make_request(&bp->bio1);
		if (make_request(q, &bp->bio2))
			generic_make_request(&bp->bio2);

		bio_pair_release(bp);
		return 0;
	bad_map:
		printk("raid10_make_request bug: can't convert block across chunks"
		       " or bigger than %dk %llu %d\n", chunk_sects/2,
		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
		bio_io_error(bio);
		return 0;
	}

	md_write_start(mddev, bio);

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
*/
	wait_barrier(conf);

	disk_stat_inc(mddev->gendisk, ios[rw]);
	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));

	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

	r10_bio->master_bio = bio;
	r10_bio->sectors = bio->bi_size >> 9;

	r10_bio->mddev = mddev;
	r10_bio->sector = bio->bi_sector;
	r10_bio->state = 0;

	if (rw == READ) {
		/*
		 * read balancing logic:
		 */
		int disk = read_balance(conf, r10_bio);
		int slot = r10_bio->read_slot;
		if (disk < 0) {
			/* no readable device: fail the bio */
			raid_end_bio_io(r10_bio);
			return 0;
		}
		mirror = conf->mirrors + disk;

		read_bio = bio_clone(bio, GFP_NOIO);

		r10_bio->devs[slot].bio = read_bio;

		read_bio->bi_sector = r10_bio->devs[slot].addr +
			mirror->rdev->data_offset;
		read_bio->bi_bdev = mirror->rdev->bdev;
		read_bio->bi_end_io = raid10_end_read_request;
		read_bio->bi_rw = READ | do_sync;
		read_bio->bi_private = r10_bio;

		generic_make_request(read_bio);
		return 0;
	}

	/*
	 * WRITE:
	 */
	/* first select target devices under spinlock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 */
	raid10_find_phys(conf, r10_bio);
	rcu_read_lock();
	for (i = 0;  i < conf->copies; i++) {
		int d = r10_bio->devs[i].devnum;
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
		if (rdev &&
		    !test_bit(Faulty, &rdev->flags)) {
			atomic_inc(&rdev->nr_pending);
			r10_bio->devs[i].bio = bio;
		} else {
			/* missing/faulty copy: record that this write is degraded */
			r10_bio->devs[i].bio = NULL;
			set_bit(R10BIO_Degraded, &r10_bio->state);
		}
	}
	rcu_read_unlock();

	atomic_set(&r10_bio->remaining, 0);

	/* clone one bio per selected target and collect them on a local list */
	bio_list_init(&bl);
	for (i = 0; i < conf->copies; i++) {
		struct bio *mbio;
		int d = r10_bio->devs[i].devnum;
		if (!r10_bio->devs[i].bio)
			continue;

		mbio = bio_clone(bio, GFP_NOIO);
		r10_bio->devs[i].bio = mbio;

		mbio->bi_sector	= r10_bio->devs[i].addr+
			conf->mirrors[d].rdev->data_offset;
		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		mbio->bi_end_io	= raid10_end_write_request;
		mbio->bi_rw = WRITE | do_sync;
		mbio->bi_private = r10_bio;

		atomic_inc(&r10_bio->remaining);
		bio_list_add(&bl, mbio);
	}

	if 
(unlikely(!atomic_read(&r10_bio->remaining))) {
		/* the array is dead */
		md_write_end(mddev);
		raid_end_bio_io(r10_bio);
		return 0;
	}

	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
	spin_lock_irqsave(&conf->device_lock, flags);
	bio_list_merge(&conf->pending_bio_list, &bl);
	blk_plug_device(mddev->queue);
	spin_unlock_irqrestore(&conf->device_lock, flags);

	/* for a sync request, kick raid10d right away instead of waiting
	 * for an unplug
	 */
	if (do_sync)
		md_wakeup_thread(mddev->thread);

	return 0;
}

/* Emit this array's fragment of the /proc/mdstat status line. */
static void status(struct seq_file *seq, mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;

	if (conf->near_copies < conf->raid_disks)
		seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
	if (conf->near_copies > 1)
		seq_printf(seq, " %d near-copies", conf->near_copies);
	if (conf->far_copies > 1) {
		if (conf->far_offset)
			seq_printf(seq, " %d offset-copies", conf->far_copies);
		else
			seq_printf(seq, " %d far-copies", conf->far_copies);
	}
	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
					conf->raid_disks - mddev->degraded);
	for (i = 0; i < conf->raid_disks; i++)
		seq_printf(seq, "%s",
			      conf->mirrors[i].rdev &&
			      test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
	seq_printf(seq, "]");
}

/* Mark a member device as failed, unless it is the last working disk. */
static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	char b[BDEVNAME_SIZE];
	conf_t *conf = mddev_to_conf(mddev);

	/*
	 * If it is not operational, then we have already marked it as dead
	 * else if it is the last working disks, ignore the error, let the
	 * next level up know.
	 * else mark the drive as failed
	 */
	if (test_bit(In_sync, &rdev->flags)
	    && conf->raid_disks-mddev->degraded == 1)
		/*
		 * Don't fail the drive, just return an IO error.
		 * The test should really be more sophisticated than
		 * "working_disks == 1", but it isn't critical, and
		 * can wait until we do more sophisticated "is the drive
		 * really dead" tests...
*/
		return;
	if (test_and_clear_bit(In_sync, &rdev->flags)) {
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded++;
		spin_unlock_irqrestore(&conf->device_lock, flags);
		/*
		 * if recovery is running, make sure it aborts.
		 */
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
	}
	set_bit(Faulty, &rdev->flags);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n"
		"	Operation continuing on %d devices\n",
		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}

/* Dump the current mirror configuration to the kernel log (debug aid). */
static void print_conf(conf_t *conf)
{
	int i;
	mirror_info_t *tmp;

	printk("RAID10 conf printout:\n");
	if (!conf) {
		printk("(!conf)\n");
		return;
	}
	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
		conf->raid_disks);

	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->mirrors + i;
		if (tmp->rdev)
			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
				i, !test_bit(In_sync, &tmp->rdev->flags),
			        !test_bit(Faulty, &tmp->rdev->flags),
				bdevname(tmp->rdev->bdev,b));
	}
}

/* Tear down resync state: run one barrier cycle to flush pending IO,
 * then release the resync buffer pool.
 */
static void close_sync(conf_t *conf)
{
	wait_barrier(conf);
	allow_barrier(conf);

	mempool_destroy(conf->r10buf_pool);
	conf->r10buf_pool = NULL;
}

/* check if there are enough drives for
 * every block to appear on atleast one
 */
static int enough(conf_t *conf)
{
	int first = 0;

	do {
		int n = conf->copies;
		int cnt = 0;
		/* walk one group of 'copies' consecutive slots; all-empty
		 * means some block has no surviving copy
		 */
		while (n--) {
			if (conf->mirrors[first].rdev)
				cnt++;
			first = (first+1) % conf->raid_disks;
		}
		if (cnt == 0)
			return 0;
	} while (first != 0);
	return 1;
}

/* Scan for devices that have become usable again and mark them In_sync,
 * decrementing the degraded count for each one.
 */
static int raid10_spare_active(mddev_t *mddev)
{
	int i;
	conf_t *conf = mddev->private;
	mirror_info_t *tmp;

	/*
	 * Find all non-in_sync disks within the RAID10 configuration
	 * and mark them in_sync
	 */
	for (i = 0; i < conf->raid_disks; i++) {
		tmp = conf->mirrors + i;
		if (tmp->rdev
		    && !test_bit(Faulty, &tmp->rdev->flags)
		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
			unsigned long flags;
			spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded--;
			spin_unlock_irqrestore(&conf->device_lock, flags);
		}
	}

	print_conf(conf);
	return 0;
}

/* Hot-add a spare into the first free slot, preferring the slot the
 * device previously occupied.  Returns 1 if the device was placed,
 * 0 otherwise.
 */
static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	conf_t *conf = mddev->private;
	int found = 0;
	int mirror;
	mirror_info_t *p;

	if (mddev->recovery_cp < MaxSector)
		/* only hot-add to in-sync arrays, as recovery is
		 * very different from resync
		 */
		return 0;
	if (!enough(conf))
		return 0;

	/* prefer the device's old slot if it is still vacant */
	if (rdev->saved_raid_disk >= 0 &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		mirror = rdev->saved_raid_disk;
	else
		mirror = 0;
	for ( ; mirror < mddev->raid_disks; mirror++)
		if ( !(p=conf->mirrors+mirror)->rdev) {

			blk_queue_stack_limits(mddev->queue,
					       rdev->bdev->bd_disk->queue);
			/* as we don't honour merge_bvec_fn, we must never risk
			 * violating it, so limit ->max_sector to one PAGE, as
			 * a one page request is never in violation.
			 */
			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
				mddev->queue->max_sectors = (PAGE_SIZE>>9);

			p->head_position = 0;
			rdev->raid_disk = mirror;
			found = 1;
			/* NOTE(review): landing in a different slot sets
			 * conf->fullsync — presumably forcing a full resync;
			 * the consumer is not visible in this chunk.
			 */
			if (rdev->saved_raid_disk != mirror)
				conf->fullsync = 1;
			rcu_assign_pointer(p->rdev, rdev);
			break;
		}

	print_conf(conf);
	return found;
}

static int raid10_remove_disk(mddev_t *mddev, int number)
{
	conf_t *conf = mddev->private;
	int err = 0;
	mdk_rdev_t *rdev;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?