raid10.c

From "Linux Kernel 2.6.9 for OMAP1710" · C source · 1,781 lines · page 1 of 4

/**
 *	raid10_mergeable_bvec -- tell bio layer if two requests can be merged
 *	@q: request queue
 *	@bio: the buffer head that's been built up so far
 *	@biovec: the request that could be merged to it.
 *
 *	Return amount of bytes we can accept at this offset
 *	If near_copies == raid_disks, there are no striping issues,
 *	but in that case, the function isn't called at all.
 */
static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio,
				 struct bio_vec *bio_vec)
{
	mddev_t *mddev = q->queuedata;
	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
	int max;
	unsigned int chunk_sectors = mddev->chunk_size >> 9;
	unsigned int bio_sectors = bio->bi_size >> 9;

	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	if (max <= bio_vec->bv_len && bio_sectors == 0)
		return bio_vec->bv_len;
	else
		return max;
}
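/*
 * Illustrative worked example (not part of the original file; the
 * numbers are assumed): with 64KiB chunks, chunk_sectors = 128. For a
 * bio that starts 8 sectors into its chunk and already holds 112
 * sectors, max = (128 - (8 + 112)) << 9 = 8 << 9 = 4096 bytes, i.e.
 * one more 4KiB bvec fits before the chunk boundary. A bio already
 * past the boundary would yield a negative max, hence the clamp to 0.
 */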
/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */

/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies / far_copies geometry.
 */
static int read_balance(conf_t *conf, r10bio_t *r10_bio)
{
	const unsigned long this_sector = r10_bio->sector;
	int disk, slot, nslot;
	const int sectors = r10_bio->sectors;
	sector_t new_distance, current_distance;

	raid10_find_phys(conf, r10_bio);
	spin_lock_irq(&conf->device_lock);

	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on, or below the resync window.
	 * We take the first readable disk when above the resync window.
	 */
	if (conf->mddev->recovery_cp < MaxSector
	    && (this_sector + sectors >= conf->next_resync)) {
		/* make sure that disk is operational */
		slot = 0;
		disk = r10_bio->devs[slot].devnum;

		while (!conf->mirrors[disk].rdev ||
		       !conf->mirrors[disk].rdev->in_sync) {
			slot++;
			if (slot == conf->copies) {
				slot = 0;
				disk = -1;
				break;
			}
			disk = r10_bio->devs[slot].devnum;
		}
		goto rb_out;
	}

	/* make sure the disk is operational */
	slot = 0;
	disk = r10_bio->devs[slot].devnum;
	while (!conf->mirrors[disk].rdev ||
	       !conf->mirrors[disk].rdev->in_sync) {
		slot++;
		if (slot == conf->copies) {
			disk = -1;
			goto rb_out;
		}
		disk = r10_bio->devs[slot].devnum;
	}

	current_distance = abs(this_sector - conf->mirrors[disk].head_position);

	/* Find the disk whose head is closest */
	for (nslot = slot; nslot < conf->copies; nslot++) {
		int ndisk = r10_bio->devs[nslot].devnum;

		if (!conf->mirrors[ndisk].rdev ||
		    !conf->mirrors[ndisk].rdev->in_sync)
			continue;

		if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) {
			disk = ndisk;
			slot = nslot;
			break;
		}
		new_distance = abs(r10_bio->devs[nslot].addr -
				   conf->mirrors[ndisk].head_position);
		if (new_distance < current_distance) {
			current_distance = new_distance;
			disk = ndisk;
			slot = nslot;
		}
	}

rb_out:
	r10_bio->read_slot = slot;
/*	conf->next_seq_sect = this_sector + sectors;*/
	if (disk >= 0 && conf->mirrors[disk].rdev)
		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
	spin_unlock_irq(&conf->device_lock);

	return disk;
}
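/*
 * Illustrative walk-through of read_balance() (not part of the
 * original file; the numbers are assumed): with copies = 2 and both
 * mirrors in_sync, suppose the head-distance estimate for the first
 * operational copy comes to 4000 sectors while slot 1's copy lies
 * only 100 sectors from its disk's last known head position. The loop
 * then settles on slot 1 -- unless some in_sync disk is completely
 * idle (nr_pending == 0), in which case it wins immediately via the
 * early break.
 */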
static void unplug_slaves(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			spin_unlock_irqrestore(&conf->device_lock, flags);

			if (r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);

			spin_lock_irqsave(&conf->device_lock, flags);
			atomic_dec(&rdev->nr_pending);
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

static void raid10_unplug(request_queue_t *q)
{
	unplug_slaves(q->queuedata);
}

static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
			      sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;
	int i, ret = 0;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue = bdev_get_queue(bdev);

			if (r_queue->issue_flush_fn) {
				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
				if (ret)
					break;
			}
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
	return ret;
}

/*
 * Throttle resync depth, so that we can both get proper overlapping of
 * requests and still handle normal requests quickly.
 */
#define RESYNC_DEPTH 32

static void device_barrier(conf_t *conf, sector_t sect)
{
	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
			    conf->resync_lock, unplug_slaves(conf->mddev));

	if (!conf->barrier++) {
		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
				    conf->resync_lock, unplug_slaves(conf->mddev));
		if (conf->nr_pending)
			BUG();
	}
	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock, unplug_slaves(conf->mddev));
	conf->next_resync = sect;
	spin_unlock_irq(&conf->resync_lock);
}
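/*
 * How the barrier handshake fits together (explanatory note, not part
 * of the original file): a resync pass raises conf->barrier via
 * device_barrier() above; the first holder also waits for
 * conf->nr_pending to drain to zero. make_request() below does the
 * converse, sleeping on wait_resume until conf->barrier drops to zero
 * before bumping nr_pending. RESYNC_DEPTH caps how many barrier
 * holders may stack up, keeping enough resync requests in flight to
 * overlap them without starving normal I/O.
 */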
static int make_request(request_queue_t *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	mirror_info_t *mirror;
	r10bio_t *r10_bio;
	struct bio *read_bio;
	int i;
	int chunk_sects = conf->chunk_mask + 1;

	/* If this request crosses a chunk boundary, we need to
	 * split it.  This will only happen for 1 PAGE (or less) requests.
	 */
	if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
		      > chunk_sects &&
		    conf->near_copies < conf->raid_disks)) {
		struct bio_pair *bp;
		/* Sanity check -- queue functions should prevent this happening */
		if (bio->bi_vcnt != 1 ||
		    bio->bi_idx != 0)
			goto bad_map;
		/* This is a one page bio that upper layers
		 * refuse to split for us, so we need to split it.
		 */
		bp = bio_split(bio, bio_split_pool,
			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
		if (make_request(q, &bp->bio1))
			generic_make_request(&bp->bio1);
		if (make_request(q, &bp->bio2))
			generic_make_request(&bp->bio2);

		bio_pair_release(bp);
		return 0;
	bad_map:
		printk("raid10_make_request bug: can't convert block across chunks"
		       " or bigger than %dk %llu %d\n", chunk_sects/2,
		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
		bio_io_error(bio, bio->bi_size);
		return 0;
	}

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);

	if (bio_data_dir(bio)==WRITE) {
		disk_stat_inc(mddev->gendisk, writes);
		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
	} else {
		disk_stat_inc(mddev->gendisk, reads);
		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
	}

	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

	r10_bio->master_bio = bio;
	r10_bio->sectors = bio->bi_size >> 9;

	r10_bio->mddev = mddev;
	r10_bio->sector = bio->bi_sector;

	if (bio_data_dir(bio) == READ) {
		/*
		 * read balancing logic:
		 */
		int disk = read_balance(conf, r10_bio);
		int slot = r10_bio->read_slot;
		if (disk < 0) {
			raid_end_bio_io(r10_bio);
			return 0;
		}
		mirror = conf->mirrors + disk;

		read_bio = bio_clone(bio, GFP_NOIO);

		r10_bio->devs[slot].bio = read_bio;

		read_bio->bi_sector = r10_bio->devs[slot].addr +
			mirror->rdev->data_offset;
		read_bio->bi_bdev = mirror->rdev->bdev;
		read_bio->bi_end_io = raid10_end_read_request;
		read_bio->bi_rw = READ;
		read_bio->bi_private = r10_bio;

		generic_make_request(read_bio);
		return 0;
	}

	/*
	 * WRITE:
	 */
	/* first select target devices under spinlock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 */
	raid10_find_phys(conf, r10_bio);
	spin_lock_irq(&conf->device_lock);
	for (i = 0;  i < conf->copies; i++) {
		int d = r10_bio->devs[i].devnum;
		if (conf->mirrors[d].rdev &&
		    !conf->mirrors[d].rdev->faulty) {
			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
			r10_bio->devs[i].bio = bio;
		} else
			r10_bio->devs[i].bio = NULL;
	}
	spin_unlock_irq(&conf->device_lock);

	atomic_set(&r10_bio->remaining, 1);
	md_write_start(mddev);
	for (i = 0; i < conf->copies; i++) {
		struct bio *mbio;
		int d = r10_bio->devs[i].devnum;
		if (!r10_bio->devs[i].bio)
			continue;

		mbio = bio_clone(bio, GFP_NOIO);
		r10_bio->devs[i].bio = mbio;

		mbio->bi_sector	= r10_bio->devs[i].addr +
			conf->mirrors[d].rdev->data_offset;
		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		mbio->bi_end_io	= raid10_end_write_request;
		mbio->bi_rw = WRITE;
		mbio->bi_private = r10_bio;

		atomic_inc(&r10_bio->remaining);
		generic_make_request(mbio);
	}

	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_write_end(mddev);
		raid_end_bio_io(r10_bio);
	}

	return 0;
}

static void status(struct seq_file *seq, mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;

	if (conf->near_copies < conf->raid_disks)
		seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
	if (conf->near_copies > 1)
		seq_printf(seq, " %d near-copies", conf->near_copies);
	if (conf->far_copies > 1)
		seq_printf(seq, " %d far-copies", conf->far_copies);

	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
					conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		seq_printf(seq, "%s",
			      conf->mirrors[i].rdev &&
			      conf->mirrors[i].rdev->in_sync ? "U" : "_");
	seq_printf(seq, "]");
}

static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	char b[BDEVNAME_SIZE];
	conf_t *conf = mddev_to_conf(mddev);

	/*
	 * If it is not operational, then we have already marked it as dead,
	 * else if it is the last working disk, ignore the error, let the
	 * next level up know,
	 * else mark the drive as failed.
	 */
	if (rdev->in_sync
	    && conf->working_disks == 1)
		/*
		 * Don't fail the drive, just return an IO error.
		 * The test should really be more sophisticated than
		 * "working_disks == 1", but it isn't critical, and
		 * can wait until we do more sophisticated "is the drive
		 * really dead" tests...
		 */
		return;
	if (rdev->in_sync) {
		mddev->degraded++;
		conf->working_disks--;
		/*
		 * if recovery is running, make sure it aborts.
		 */
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
	}
	rdev->in_sync = 0;
	rdev->faulty = 1;
	mddev->sb_dirty = 1;
	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n"
		"	Operation continuing on %d devices\n",
		bdevname(rdev->bdev,b), conf->working_disks);
}

static void print_conf(conf_t *conf)
{
	int i;
	mirror_info_t *tmp;

	printk("RAID10 conf printout:\n");
	if (!conf) {
		printk("(!conf)\n");
		return;
	}
	printk(" --- wd:%d rd:%d\n", conf->working_disks,
		conf->raid_disks);

	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->mirrors + i;
		if (tmp->rdev)
			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
				i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
				bdevname(tmp->rdev->bdev,b));
	}
}

static void close_sync(conf_t *conf)
{
	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
			    conf->resync_lock, unplug_slaves(conf->mddev));
	spin_unlock_irq(&conf->resync_lock);

	if (conf->barrier) BUG();
	if (waitqueue_active(&conf->wait_idle)) BUG();
