raid10.c

来自「linux 内核源代码」· C语言 代码 · 共 2,188 行 · 第 1/4 页

C
2,188
字号
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "dm-bio-list.h"
#include <linux/raid/raid10.h>
#include <linux/raid/bitmap.h>

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout )
 *
 * The data to be stored is divided into chunks using chunksize.
 * Each device is divided into far_copies sections.
 * In each section, chunks are laid out in a style similar to raid0, but
 * near_copies copies of each chunk is stored (each on a different drive).
 * The starting device for each section is offset near_copies from the starting
 * device of the previous section.
 * Thus there are (near_copies*far_copies) copies of each chunk, and each is
 * on a different drive.
 * near_copies and far_copies must be at least one, and their product is at most
 * raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are in adjacent stripes.
 */

/*
 * Number of guaranteed r10bios in case of extreme VM load:
 */
#define	NR_RAID10_BIOS 256

static void unplug_slaves(mddev_t *mddev);

static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);

/*
 * Pool allocation callback for r10bio structures.
 * @data is the conf_t of the array.  The r10bio is zero-filled and sized
 * so that its trailing devs[] array holds conf->copies entries.
 * Returns NULL on allocation failure, after calling unplug_slaves().
 */
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	conf_t *conf = data;
	r10bio_t *r10_bio;
	int size = offsetof(struct r10bio_s, devs[conf->copies]);

	/* allocate a r10bio with room for conf->copies entries in the devs array */
	r10_bio = kzalloc(size, gfp_flags);
	if (!r10_bio)
		unplug_slaves(conf->mddev);

	return r10_bio;
}

/*
 * Pool free callback matching r10bio_pool_alloc(); @data is unused.
 */
static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf)
 *
 * Returns an r10bio whose first nalloc devs[] entries each carry a bio
 * with RESYNC_PAGES pages attached, or NULL if any allocation fails (in
 * which case everything allocated so far is released again).
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	conf_t *conf = data;
	struct page *page;
	r10bio_t *r10_bio;
	struct bio *bio;
	int i, j;
	int nalloc;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio) {
		unplug_slaves(conf->mddev);
		return NULL;
	}

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * Allocate bios.  The loop counts down so that, on failure, j is
	 * the index that could not be allocated and every index above it
	 * already holds a bio (see out_free_bio).
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0 ; j < nalloc; j++) {
		bio = r10_bio->devs[j].bio;
		for (i = 0; i < RESYNC_PAGES; i++) {
			page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
		}
	}

	return r10_bio;

out_free_pages:
	/* pages already attached to the bio that was being filled */
	for ( ; i > 0 ; i--)
		safe_put_page(bio->bi_io_vec[i-1].bv_page);
	/* all pages of the bios that were completely filled before it */
	while (j--)
		for (i = 0; i < RESYNC_PAGES ; i++)
			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
	/* every bio was allocated on this path: release indexes 0..nalloc-1 */
	j = -1;
out_free_bio:
	while ( ++j < nalloc )
		bio_put(r10_bio->devs[j].bio);
	r10bio_pool_free(r10_bio, conf);
	return NULL;
}

/*
 * Pool free callback matching r10buf_pool_alloc().
 * Walks all conf->copies slots; slots that never received a bio are NULL
 * (the r10bio is allocated zero-filled) and are skipped.
 */
static void r10buf_pool_free(void *__r10_bio, void *data)
{
	int i;
	conf_t *conf = data;
	r10bio_t *r10bio = __r10_bio;
	int j;

	for (j=0; j < conf->copies; j++) {
		struct bio *bio = r10bio->devs[j].bio;
		if (bio) {
			for (i = 0; i < RESYNC_PAGES; i++) {
				safe_put_page(bio->bi_io_vec[i].bv_page);
				bio->bi_io_vec[i].bv_page = NULL;
			}
			bio_put(bio);
		}
	}
	r10bio_pool_free(r10bio, conf);
}

/*
 * Drop the reference on every per-copy bio of @r10_bio and clear the slot.
 * Slots holding the IO_BLOCKED marker are cleared without a bio_put().
 */
static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
{
	int i;

	for (i = 0; i < conf->copies; i++) {
		struct bio **bio = & r10_bio->devs[i].bio;
		if (*bio && *bio != IO_BLOCKED)
			bio_put(*bio);
		*bio = NULL;
	}
}

/*
 * Release an r10bio that was used for normal I/O back to r10bio_pool.
 */
static void free_r10bio(r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	allow_barrier(conf);

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, conf->r10bio_pool);
}

/*
 * Release an r10bio obtained from r10buf_pool and lower the barrier.
 */
static void put_buf(r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	mempool_free(r10_bio, conf->r10buf_pool);

	lower_barrier(conf);
}

/*
 * Queue @r10_bio on conf->retry_list (under device_lock, counting it in
 * nr_queued) and wake the md thread.
 */
static void reschedule_retry(r10bio_t *r10_bio)
{
	unsigned long flags;
	mddev_t *mddev = r10_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	conf->nr_queued ++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 *
 * The master bio is completed with 0 if R10BIO_Uptodate is set and -EIO
 * otherwise; the r10bio is freed afterwards and must not be used again.
 */
static void raid_end_bio_io(r10bio_t *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;

	bio_endio(bio,
		test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

/*
 * Completion handler for a read issued to one copy.
 * On success the master bio is completed; on failure the r10bio is handed
 * to the retry list.
 */
static void raid10_end_read_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
	int slot, dev;
	/*
	 * conf and dev are captured up front: r10_bio may already have been
	 * freed or requeued by the time rdev_dec_pending() runs below.
	 */
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	slot = r10_bio->read_slot;
	dev = r10_bio->devs[slot].devnum;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(slot, r10_bio);

	if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);
		raid_end_bio_io(r10_bio);
	} else {
		/*
		 * oops, read error:
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
		reschedule_retry(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}

/*
 * Completion handler for a write issued to one copy.
 * A failed write marks the device faulty and the r10bio degraded; when the
 * last outstanding write finishes, the bitmap is updated and the master
 * bio is completed.
 */
static void raid10_end_write_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
	int slot, dev;
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	/* find which copy this bio belongs to */
	for (slot = 0; slot < conf->copies; slot++)
		if (r10_bio->devs[slot].bio == bio)
			break;
	/*
	 * The bio must be one of ours; without this check a miss would
	 * index one element past the end of devs[] below.
	 */
	BUG_ON(slot == conf->copies);
	dev = r10_bio->devs[slot].devnum;

	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate) {
		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
		/* an I/O failed, we can't clear the bitmap */
		set_bit(R10BIO_Degraded, &r10_bio->state);
	} else
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);

	update_head_pos(slot, r10_bio);

	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		/* clear the bitmap if all writes complete successfully */
		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
				r10_bio->sectors,
				!test_bit(R10BIO_Degraded, &r10_bio->state),
				0);
		md_write_end(r10_bio->mddev);
		raid_end_bio_io(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}


/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
* near_copies * far_copies must be <= raid_disks. * Normally one of these will be 1. * If both are 1, we get raid0. * If near_copies == raid_disks, we get raid1. * * Chunks are layed out in raid0 style with near_copies copies of the * first chunk, followed by near_copies copies of the next chunk and * so on. * If far_copies > 1, then after 1/far_copies of the array has been assigned * as described above, we start again with a device offset of near_copies. * So we effectively have another copy of the whole array further down all * the drives, but with blocks on different drives. * With this layout, and block is never stored twice on the one device. * * raid10_find_phys finds the sector offset of a given virtual sector * on each device that it is on. * * raid10_find_virt does the reverse mapping, from a device and a * sector offset to a virtual address */static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio){	int n,f;	sector_t sector;	sector_t chunk;	sector_t stripe;	int dev;	int slot = 0;	/* now calculate first sector/dev */	chunk = r10bio->sector >> conf->chunk_shift;	sector = r10bio->sector & conf->chunk_mask;	chunk *= conf->near_copies;	stripe = chunk;	dev = sector_div(stripe, conf->raid_disks);	if (conf->far_offset)		stripe *= conf->far_copies;	sector += stripe << conf->chunk_shift;	/* and calculate all the others */	for (n=0; n < conf->near_copies; n++) {		int d = dev;		sector_t s = sector;		r10bio->devs[slot].addr = sector;		r10bio->devs[slot].devnum = d;		slot++;		for (f = 1; f < conf->far_copies; f++) {			d += conf->near_copies;			if (d >= conf->raid_disks)				d -= conf->raid_disks;			s += conf->stride;			r10bio->devs[slot].devnum = d;			r10bio->devs[slot].addr = s;			slot++;		}		dev++;		if (dev >= conf->raid_disks) {			dev = 0;			sector += (conf->chunk_mask + 1);		}	}	BUG_ON(slot != conf->copies);}static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev){	sector_t offset, chunk, vchunk;	offset = sector & conf->chunk_mask;	if 
(conf->far_offset) {		int fc;		chunk = sector >> conf->chunk_shift;		fc = sector_div(chunk, conf->far_copies);		dev -= fc * conf->near_copies;		if (dev < 0)			dev += conf->raid_disks;	} else {		while (sector >= conf->stride) {			sector -= conf->stride;			if (dev < conf->near_copies)				dev += conf->raid_disks - conf->near_copies;			else				dev -= conf->near_copies;		}		chunk = sector >> conf->chunk_shift;	}	vchunk = chunk * conf->raid_disks + dev;	sector_div(vchunk, conf->near_copies);	return (vchunk << conf->chunk_shift) + offset;}/** *	raid10_mergeable_bvec -- tell bio layer if a two requests can be merged *	@q: request queue *	@bio: the buffer head that's been built up so far *	@biovec: the request that could be merged to it. * *	Return amount of bytes we can accept at this offset *      If near_copies == raid_disk, there are no striping issues, *      but in that case, the function isn't called at all. */static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio,				struct bio_vec *bio_vec){	mddev_t *mddev = q->queuedata;	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);	int max;	unsigned int chunk_sectors = mddev->chunk_size >> 9;	unsigned int bio_sectors = bio->bi_size >> 9;	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;	if (max < 0) max = 0; /* bio_add cannot handle a negative return */	if (max <= bio_vec->bv_len && bio_sectors == 0)		return bio_vec->bv_len;	else		return max;}/* * This routine returns the disk from which the requested read should * be done. There is a per-array 'next expected sequential IO' sector * number - if this matches on the next IO then we use the last disk. * There is also a per-disk 'last know head position' sector that is * maintained from IRQ contexts, both the normal and the resync IO * completion handlers update this position correctly. If there is no * perfect sequential match then we pick the disk whose head is closest. 
* * If there are 2 mirrors in the same 2 devices, performance degrades * because position is mirror, not device based. * * The rdev for the device selected will have nr_pending incremented. *//* * FIXME: possibly should rethink readbalancing and do it differently * depending on near_copies / far_copies geometry. */static int read_balance(conf_t *conf, r10bio_t *r10_bio){	const unsigned long this_sector = r10_bio->sector;	int disk, slot, nslot;	const int sectors = r10_bio->sectors;	sector_t new_distance, current_distance;	mdk_rdev_t *rdev;	raid10_find_phys(conf, r10_bio);	rcu_read_lock();	/*	 * Check if we can balance. We can balance on the whole	 * device if no resync is going on (recovery is ok), or below	 * the resync window. We take the first readable disk when	 * above the resync window.	 */	if (conf->mddev->recovery_cp < MaxSector	    && (this_sector + sectors >= conf->next_resync)) {		/* make sure that disk is operational */		slot = 0;		disk = r10_bio->devs[slot].devnum;		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||		       r10_bio->devs[slot].bio == IO_BLOCKED ||		       !test_bit(In_sync, &rdev->flags)) {			slot++;			if (slot == conf->copies) {				slot = 0;				disk = -1;				break;			}			disk = r10_bio->devs[slot].devnum;		}		goto rb_out;	}	/* make sure the disk is operational */	slot = 0;	disk = r10_bio->devs[slot].devnum;	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||	       r10_bio->devs[slot].bio == IO_BLOCKED ||	       !test_bit(In_sync, &rdev->flags)) {		slot ++;		if (slot == conf->copies) {			disk = -1;			goto rb_out;		}		disk = r10_bio->devs[slot].devnum;	}	current_distance = abs(r10_bio->devs[slot].addr -			       conf->mirrors[disk].head_position);	/* Find the disk whose head is closest */	for (nslot = slot; nslot < conf->copies; nslot++) {		int ndisk = r10_bio->devs[nslot].devnum;		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||		    r10_bio->devs[nslot].bio == IO_BLOCKED ||

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?