raid1.c

来自「linux 内核源代码」· C语言 代码 · 共 2,200 行 · 第 1/4 页

C
2,200
字号
			/* (tail of unplug_slaves(), whose definition begins before this
			 * chunk: unplug the member queue, then drop the temporary
			 * reference taken before the blocking call and re-enter the
			 * RCU read section for the next loop iteration) */
			blk_unplug(r_queue);
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
		}
	}
	rcu_read_unlock();
}

/*
 * Unplug callback for the array's request queue: kick every member
 * device's queue, then wake raid1d so pending writes get submitted.
 */
static void raid1_unplug(struct request_queue *q)
{
	mddev_t *mddev = q->queuedata;

	unplug_slaves(mddev);
	md_wakeup_thread(mddev->thread);
}

/*
 * Congestion callback: report the array congested if any non-faulty
 * member's backing device is congested for the requested bits.
 * Members are walked under rcu_read_lock(), matching the
 * rcu_assign_pointer()/synchronize_rcu() handling of mirrors[i].rdev.
 */
static int raid1_congested(void *data, int bits)
{
	mddev_t *mddev = data;
	conf_t *conf = mddev_to_conf(mddev);
	int i, ret = 0;

	rcu_read_lock();
	for (i = 0; i < mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags)) {
			struct request_queue *q = bdev_get_queue(rdev->bdev);

			/* Note the '|| 1' - when read_balance prefers
			 * non-congested targets, it can be removed
			 */
			if ((bits & (1<<BDI_write_congested)) || 1)
				ret |= bdi_congested(&q->backing_dev_info, bits);
			else
				ret &= bdi_congested(&q->backing_dev_info, bits);
		}
	}
	rcu_read_unlock();
	return ret;
}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
 */

/* maximum nesting depth of concurrent resync barriers */
#define RESYNC_DEPTH 32

/*
 * Raise the resync barrier: block new normal IO, then wait for all
 * in-flight normal IO to drain.  Called by resync/recovery before
 * each window of background IO; paired with lower_barrier().
 */
static void raise_barrier(conf_t *conf)
{
	spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting */
	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
			    conf->resync_lock,
			    raid1_unplug(conf->mddev->queue));

	/* block any new IO from starting */
	conf->barrier++;

	/* Now wait for all pending IO to complete */
	wait_event_lock_irq(conf->wait_barrier,
			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock,
			    raid1_unplug(conf->mddev->queue));

	spin_unlock_irq(&conf->resync_lock);
}

/*
 * Drop one barrier level and wake anyone blocked in wait_barrier().
 * irqsave variant because callers may hold state where irqs matter.
 */
static void lower_barrier(conf_t *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->barrier--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

/*
 * Called by normal IO before it starts: wait for any active barrier
 * to drop, then count ourselves in nr_pending.  nr_waiting is bumped
 * while sleeping so raise_barrier() defers to waiting requests.
 */
static void wait_barrier(conf_t *conf)
{
	spin_lock_irq(&conf->resync_lock);
	if (conf->barrier) {
		conf->nr_waiting++;
		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
				    conf->resync_lock,
				    raid1_unplug(conf->mddev->queue));
		conf->nr_waiting--;
	}
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);
}

/*
 * Normal IO completion counterpart of wait_barrier(): drop nr_pending
 * and wake a possibly-waiting raise_barrier()/freeze_array().
 */
static void allow_barrier(conf_t *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->nr_pending--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

/*
 * Freeze the array: stop sync and normal IO and wait for everything
 * already in flight to go quiet (or be parked on nr_queued).
 */
static void freeze_array(conf_t *conf)
{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until barrier+nr_pending match nr_queued+2
	 */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier++;
	conf->nr_waiting++;
	wait_event_lock_irq(conf->wait_barrier,
			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
			    conf->resync_lock,
			    raid1_unplug(conf->mddev->queue));
	spin_unlock_irq(&conf->resync_lock);
}

/* Undo freeze_array(): drop the extra barrier/nr_waiting counts. */
static void unfreeze_array(conf_t *conf)
{
	/* reverse the effect of the freeze */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier--;
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	spin_unlock_irq(&conf->resync_lock);
}

/*
 * duplicate the data pages for behind I/O
 *
 * Returns a kzalloc'd array of privately-copied pages (one per bio
 * segment), or NULL on any allocation failure, in which case the
 * caller falls back to synchronous (non-behind) writes.  Ownership of
 * the pages passes to the write bios; the array itself is kfree'd by
 * the caller (see make_request()).
 */
static struct page **alloc_behind_pages(struct bio *bio)
{
	int i;
	struct bio_vec *bvec;
	struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
					GFP_NOIO);
	if (unlikely(!pages))
		goto do_sync_io;

	bio_for_each_segment(bvec, bio, i) {
		pages[i] = alloc_page(GFP_NOIO);
		if (unlikely(!pages[i]))
			goto do_sync_io;
		/* copy the segment so the caller's pages can be released
		 * before the slow write-behind targets complete */
		memcpy(kmap(pages[i]) + bvec->bv_offset,
			kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
		kunmap(pages[i]);
		kunmap(bvec->bv_page);
	}

	return pages;

do_sync_io:
	/* pages[] was zeroed by kzalloc, so free up to the first NULL */
	if (pages)
		for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
			put_page(pages[i]);
	kfree(pages);
	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
	return NULL;
}

/*
 * Entry point for all IO to the raid1 array.
 * Reads are dispatched to a single mirror chosen by read_balance();
 * writes are cloned to every working mirror and queued on
 * conf->pending_bio_list for raid1d to submit.
 * Always returns 0 (the bio is either submitted or completed here).
 */
static int make_request(struct request_queue *q, struct bio * bio)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	mirror_info_t *mirror;
	r1bio_t *r1_bio;
	struct bio *read_bio;
	int i, targets = 0, disks;
	mdk_rdev_t *rdev;
	struct bitmap *bitmap = mddev->bitmap;
	unsigned long flags;
	struct bio_list bl;
	struct page **behind_pages = NULL;
	const int rw = bio_data_dir(bio);
	const int do_sync = bio_sync(bio);
	int do_barriers;

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 * We test barriers_work *after* md_write_start as md_write_start
	 * may cause the first superblock write, and that will check out
	 * if barriers work.
	 */
	md_write_start(mddev, bio); /* wait on superblock update early */

	if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
		/* barrier requests unsupported: fail the bio immediately */
		if (rw == WRITE)
			md_write_end(mddev);
		bio_endio(bio, -EOPNOTSUPP);
		return 0;
	}

	wait_barrier(conf);

	disk_stat_inc(mddev->gendisk, ios[rw]);
	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));

	/*
	 * make_request() can abort the operation when READA is being
	 * used and no empty request is available.
	 *
	 */
	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);

	r1_bio->master_bio = bio;
	r1_bio->sectors = bio->bi_size >> 9;
	r1_bio->state = 0;
	r1_bio->mddev = mddev;
	r1_bio->sector = bio->bi_sector;

	if (rw == READ) {
		/*
		 * read balancing logic:
		 */
		int rdisk = read_balance(conf, r1_bio);

		if (rdisk < 0) {
			/* couldn't find anywhere to read from */
			raid_end_bio_io(r1_bio);
			return 0;
		}
		mirror = conf->mirrors + rdisk;

		r1_bio->read_disk = rdisk;

		read_bio = bio_clone(bio, GFP_NOIO);

		r1_bio->bios[rdisk] = read_bio;

		/* retarget the clone at the chosen mirror */
		read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
		read_bio->bi_bdev = mirror->rdev->bdev;
		read_bio->bi_end_io = raid1_end_read_request;
		read_bio->bi_rw = READ | do_sync;
		read_bio->bi_private = r1_bio;

		generic_make_request(read_bio);
		return 0;
	}

	/*
	 * WRITE:
	 */
	/* first select target devices under spinlock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 */
	disks = conf->raid_disks;
#if 0
	{ static int first=1;
	if (first) printk("First Write sector %llu disks %d\n",
			  (unsigned long long)r1_bio->sector, disks);
	first = 0;
	}
#endif
	rcu_read_lock();
	for (i = 0;  i < disks; i++) {
		if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
		    !test_bit(Faulty, &rdev->flags)) {
			atomic_inc(&rdev->nr_pending);
			/* re-check Faulty after taking the reference: the
			 * device may have failed between the two tests */
			if (test_bit(Faulty, &rdev->flags)) {
				rdev_dec_pending(rdev, mddev);
				r1_bio->bios[i] = NULL;
			} else
				r1_bio->bios[i] = bio;
			targets++;
		} else
			r1_bio->bios[i] = NULL;
	}
	rcu_read_unlock();

	BUG_ON(targets == 0); /* we never fail the last device */

	if (targets < conf->raid_disks) {
		/* array is degraded, we will not clear the bitmap
		 * on I/O completion (see raid1_end_write_request) */
		set_bit(R1BIO_Degraded, &r1_bio->state);
	}

	/* do behind I/O ? */
	if (bitmap &&
	    atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
	    (behind_pages = alloc_behind_pages(bio)) != NULL)
		set_bit(R1BIO_BehindIO, &r1_bio->state);

	atomic_set(&r1_bio->remaining, 0);
	atomic_set(&r1_bio->behind_remaining, 0);

	do_barriers = bio_barrier(bio);
	if (do_barriers)
		set_bit(R1BIO_Barrier, &r1_bio->state);

	/* clone one write bio per selected target and collect them */
	bio_list_init(&bl);
	for (i = 0; i < disks; i++) {
		struct bio *mbio;
		if (!r1_bio->bios[i])
			continue;

		mbio = bio_clone(bio, GFP_NOIO);
		r1_bio->bios[i] = mbio;

		mbio->bi_sector	= r1_bio->sector + conf->mirrors[i].rdev->data_offset;
		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
		mbio->bi_end_io	= raid1_end_write_request;
		mbio->bi_rw = WRITE | do_barriers | do_sync;
		mbio->bi_private = r1_bio;

		if (behind_pages) {
			struct bio_vec *bvec;
			int j;

			/* Yes, I really want the '__' version so that
			 * we clear any unused pointer in the io_vec, rather
			 * than leave them unchanged.  This is important
			 * because when we come to free the pages, we won't
			 * know the original bi_idx, so we just free
			 * them all
			 */
			__bio_for_each_segment(bvec, mbio, j, 0)
				bvec->bv_page = behind_pages[j];
			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
				atomic_inc(&r1_bio->behind_remaining);
		}

		atomic_inc(&r1_bio->remaining);

		bio_list_add(&bl, mbio);
	}
	kfree(behind_pages); /* the behind pages are attached to the bios now */

	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
				test_bit(R1BIO_BehindIO, &r1_bio->state));
	/* hand the clones to raid1d via the pending list, under device_lock */
	spin_lock_irqsave(&conf->device_lock, flags);
	bio_list_merge(&conf->pending_bio_list, &bl);
	bio_list_init(&bl);

	blk_plug_device(mddev->queue);
	spin_unlock_irqrestore(&conf->device_lock, flags);

	if (do_sync)
		md_wakeup_thread(mddev->thread);
#if 0
	while ((bio = bio_list_pop(&bl)) != NULL)
		generic_make_request(bio);
#endif

	return 0;
}

/*
 * /proc/mdstat status line: "[total/working] [UU_U...]" where 'U' is
 * an in-sync member and '_' a missing or out-of-sync slot.
 */
static void status(struct seq_file *seq, mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;

	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
		   conf->raid_disks - mddev->degraded);
	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
		seq_printf(seq, "%s",
			   rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
	}
	rcu_read_unlock();
	seq_printf(seq, "]");
}

/*
 * Error handler: mark @rdev Faulty unless it is the last working
 * in-sync member, in which case the error is ignored so a single
 * surviving drive keeps the array alive.
 */
static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	char b[BDEVNAME_SIZE];
	conf_t *conf = mddev_to_conf(mddev);

	/*
	 * If it is not operational, then we have already marked it as dead
	 * else if it is the last working disk, ignore the error, let the
	 * next level up know.
	 * else mark the drive as failed
	 */
	if (test_bit(In_sync, &rdev->flags)
	    && (conf->raid_disks - mddev->degraded) == 1)
		/*
		 * Don't fail the drive, act as though we were just a
		 * normal single drive
		 */
		return;
	if (test_and_clear_bit(In_sync, &rdev->flags)) {
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded++;
		set_bit(Faulty, &rdev->flags);
		spin_unlock_irqrestore(&conf->device_lock, flags);
		/*
		 * if recovery is running, make sure it aborts.
		 */
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
	} else
		set_bit(Faulty, &rdev->flags);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
		"	Operation continuing on %d devices\n",
		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
}

/* Debug dump of the conf: working/total counts and per-disk state. */
static void print_conf(conf_t *conf)
{
	int i;

	printk("RAID1 conf printout:\n");
	if (!conf) {
		printk("(!conf)\n");
		return;
	}
	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
		conf->raid_disks);

	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev)
			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
			       i, !test_bit(In_sync, &rdev->flags),
			       !test_bit(Faulty, &rdev->flags),
			       bdevname(rdev->bdev,b));
	}
	rcu_read_unlock();
}

/*
 * End of a resync/recovery pass: wait out any stragglers via the
 * barrier pair, then tear down the resync buffer pool.
 */
static void close_sync(conf_t *conf)
{
	wait_barrier(conf);
	allow_barrier(conf);

	mempool_destroy(conf->r1buf_pool);
	conf->r1buf_pool = NULL;
}

/*
 * Promote freshly-recovered members to In_sync and adjust the
 * degraded count.  Always returns 0.
 */
static int raid1_spare_active(mddev_t *mddev)
{
	int i;
	conf_t *conf = mddev->private;

	/*
	 * Find all failed disks within the RAID1 configuration
	 * and mark them readable.
	 * Called under mddev lock, so rcu protection not needed.
	 */
	for (i = 0; i < conf->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev
		    && !test_bit(Faulty, &rdev->flags)
		    && !test_and_set_bit(In_sync, &rdev->flags)) {
			unsigned long flags;
			spin_lock_irqsave(&conf->device_lock, flags);
			mddev->degraded--;
			spin_unlock_irqrestore(&conf->device_lock, flags);
		}
	}

	print_conf(conf);
	return 0;
}

/*
 * Hot-add @rdev into the first free mirror slot.
 * Returns 1 if a slot was found and the device installed, else 0.
 */
static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	conf_t *conf = mddev->private;
	int found = 0;
	int mirror = 0;
	mirror_info_t *p;

	for (mirror=0; mirror < mddev->raid_disks; mirror++)
		if ( !(p=conf->mirrors+mirror)->rdev) {

			blk_queue_stack_limits(mddev->queue,
					       rdev->bdev->bd_disk->queue);
			/* as we don't honour merge_bvec_fn, we must never risk
			 * violating it, so limit ->max_sector to one PAGE, as
			 * a one page request is never in violation.
			 */
			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

			p->head_position = 0;
			rdev->raid_disk = mirror;
			found = 1;
			/* As all devices are equivalent, we don't need a full recovery
			 * if this was recently any drive of the array
			 */
			if (rdev->saved_raid_disk < 0)
				conf->fullsync = 1;
			rcu_assign_pointer(p->rdev, rdev);
			break;
		}

	print_conf(conf);
	return found;
}

/*
 * Hot-remove the member in slot @number.
 * Returns 0 on success, -EBUSY if the device is still in-sync or has
 * IO pending (including IO raced in after the pointer was cleared,
 * caught by re-checking nr_pending after synchronize_rcu()).
 */
static int raid1_remove_disk(mddev_t *mddev, int number)
{
	conf_t *conf = mddev->private;
	int err = 0;
	mdk_rdev_t *rdev;
	mirror_info_t *p = conf->mirrors+ number;

	print_conf(conf);
	rdev = p->rdev;
	if (rdev) {
		if (test_bit(In_sync, &rdev->flags) ||
		    atomic_read(&rdev->nr_pending)) {
			err = -EBUSY;
			goto abort;
		}
		p->rdev = NULL;
		synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
			err = -EBUSY;
			p->rdev = rdev;
		}
	}
abort:
	print_conf(conf);
	return err;
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?