/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/raid/raid10.h>

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *
 * The data to be stored is divided into chunks using chunksize.
 * Each device is divided into far_copies sections.
 * In each section, chunks are laid out in a style similar to raid0, but
 * near_copies copies of each chunk are stored (each on a different drive).
 * The starting device for each section is offset near_copies from the starting
 * device of the previous section.
 * Thus there are (near_copies*far_copies) of each chunk, and each is on a
 * different drive.
 * near_copies and far_copies must be at least one, and their product is at
 * most raid_disks.
*/
/*
 * Number of guaranteed r10bios in case of extreme VM load:
 */
#define	NR_RAID10_BIOS 256

static void unplug_slaves(mddev_t *mddev);

/*
 * mempool constructor: allocate a zeroed r10bio_t whose trailing devs[]
 * array has one slot per copy.  If the allocation fails, unplug the
 * member disks so in-flight writeback can complete and free memory.
 */
static void * r10bio_pool_alloc(int gfp_flags, void *data)
{
	conf_t *conf = data;
	r10bio_t *r10_bio;
	int size = offsetof(struct r10bio_s, devs[conf->copies]);

	/* allocate a r10bio with room for raid_disks entries in the bios array */
	r10_bio = kmalloc(size, gfp_flags);
	if (r10_bio)
		memset(r10_bio, 0, size);
	else
		unplug_slaves(conf->mddev);
	return r10_bio;
}

/* mempool destructor counterpart of r10bio_pool_alloc() */
static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf)
 *
 */
static void * r10buf_pool_alloc(int gfp_flags, void *data)
{
	conf_t *conf = data;
	struct page *page;
	r10bio_t *r10_bio;
	struct bio *bio;
	int i, j;
	int nalloc;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio) {
		unplug_slaves(conf->mddev);
		return NULL;
	}

	/* resync needs one bio per copy; recovery only a read/write pair */
	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0 ; j < nalloc; j++) {
		bio = r10_bio->devs[j].bio;
		for (i = 0; i < RESYNC_PAGES; i++) {
			page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
		}
	}

	return r10_bio;

out_free_pages:
	/* first the pages of the partially-filled bio (bio/i still current) */
	for ( ; i > 0 ; i--)
		__free_page(bio->bi_io_vec[i-1].bv_page);
	/* then all pages of every fully-populated earlier bio */
	while (j--)
		for (i = 0; i < RESYNC_PAGES ; i++)
			__free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
	j = -1;
out_free_bio:
	/* drop every bio allocated so far ([0, j) were not yet allocated) */
	while ( ++j < nalloc )
		bio_put(r10_bio->devs[j].bio);
	r10bio_pool_free(r10_bio, conf);
	return NULL;
}

/* mempool destructor: free all pages and bios attached by r10buf_pool_alloc() */
static void r10buf_pool_free(void *__r10_bio, void *data)
{
	int i;
	conf_t *conf = data;
	r10bio_t *r10bio = __r10_bio;
	int j;

	for (j=0; j < conf->copies; j++) {
		struct bio *bio = r10bio->devs[j].bio;
		if (bio) {
			for (i = 0; i < RESYNC_PAGES; i++) {
				__free_page(bio->bi_io_vec[i].bv_page);
				bio->bi_io_vec[i].bv_page = NULL;
			}
			bio_put(bio);
		}
	}
	r10bio_pool_free(r10bio, conf);
}

/* Drop and clear every per-copy bio reference held by an r10_bio. */
static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
{
	int i;

	for (i = 0; i < conf->copies; i++) {
		struct bio **bio = & r10_bio->devs[i].bio;
		if (*bio)
			bio_put(*bio);
		*bio = NULL;
	}
}

/*
 * Release a normal-I/O r10bio back to its pool and drop the array's
 * pending-I/O count under resync_lock.
 */
static inline void free_r10bio(r10bio_t *r10_bio)
{
	unsigned long flags;
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, conf->r10bio_pool);
}

/*
 * Release a resync/recovery buffer back to its pool, then drop one
 * barrier reference and the pending count, waking waiters as needed.
 */
static inline void put_buf(r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);
	unsigned long flags;

	mempool_free(r10_bio, conf->r10buf_pool);

	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!conf->barrier)
		BUG();	/* releasing a barrier that was never raised */
	--conf->barrier;
	wake_up(&conf->wait_resume);
	wake_up(&conf->wait_idle);
	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);
}

/* Queue a failed r10_bio for retry and kick the per-array md thread. */
static void reschedule_retry(r10bio_t *r10_bio)
{
	unsigned long flags;
	mddev_t *mddev = r10_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	spin_unlock_irqrestore(&conf->device_lock, flags);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(r10bio_t *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;

	bio_endio(bio, bio->bi_size,
		test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
*/
static inline void update_head_pos(int slot, r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	/* last sector of the completed request = new estimated head position */
	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

/* bio completion callback for a mirrored read (runs in interrupt context) */
static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
	int slot, dev;
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	/* non-zero bi_size means the bio is only partially complete */
	if (bio->bi_size)
		return 1;

	slot = r10_bio->read_slot;
	dev = r10_bio->devs[slot].devnum;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
	else
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);

	update_head_pos(slot, r10_bio);

	/*
	 * we have only one bio on the read side
	 */
	if (uptodate)
		raid_end_bio_io(r10_bio);
	else {
		/*
		 * oops, read error: retry from another mirror via the md thread
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
		reschedule_retry(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
	return 0;
}

/* bio completion callback for one leg of a mirrored write */
static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
	int slot, dev;
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	if (bio->bi_size)
		return 1;

	/* writes have no read_slot; find which copy this bio belongs to */
	for (slot = 0; slot < conf->copies; slot++)
		if (r10_bio->devs[slot].bio == bio)
			break;
	dev = r10_bio->devs[slot].devnum;

	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
	else
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);

	update_head_pos(slot, r10_bio);

	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_write_end(r10_bio->mddev);
		raid_end_bio_io(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
	return 0;
}

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
* If near_copies == raid_disks, we get raid1. * * Chunks are layed out in raid0 style with near_copies copies of the * first chunk, followed by near_copies copies of the next chunk and * so on. * If far_copies > 1, then after 1/far_copies of the array has been assigned * as described above, we start again with a device offset of near_copies. * So we effectively have another copy of the whole array further down all * the drives, but with blocks on different drives. * With this layout, and block is never stored twice on the one device. * * raid10_find_phys finds the sector offset of a given virtual sector * on each device that it is on. If a block isn't on a device, * that entry in the array is set to MaxSector. * * raid10_find_virt does the reverse mapping, from a device and a * sector offset to a virtual address */static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio){	int n,f;	sector_t sector;	sector_t chunk;	sector_t stripe;	int dev;	int slot = 0;	/* now calculate first sector/dev */	chunk = r10bio->sector >> conf->chunk_shift;	sector = r10bio->sector & conf->chunk_mask;	chunk *= conf->near_copies;	stripe = chunk;	dev = sector_div(stripe, conf->raid_disks);	sector += stripe << conf->chunk_shift;	/* and calculate all the others */	for (n=0; n < conf->near_copies; n++) {		int d = dev;		sector_t s = sector;		r10bio->devs[slot].addr = sector;		r10bio->devs[slot].devnum = d;		slot++;		for (f = 1; f < conf->far_copies; f++) {			d += conf->near_copies;			if (d >= conf->raid_disks)				d -= conf->raid_disks;			s += conf->stride;			r10bio->devs[slot].devnum = d;			r10bio->devs[slot].addr = s;			slot++;		}		dev++;		if (dev >= conf->raid_disks) {			dev = 0;			sector += (conf->chunk_mask + 1);		}	}	BUG_ON(slot != conf->copies);}static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev){	sector_t offset, chunk, vchunk;	while (sector > conf->stride) {		sector -= conf->stride;		if (dev < conf->near_copies)			dev += conf->raid_disks - conf->near_copies;		else		
	dev -= conf->near_copies;	}	offset = sector & conf->chunk_mask;	chunk = sector >> conf->chunk_shift;	vchunk = chunk * conf->raid_disks + dev;	sector_div(vchunk, conf->near_copies);	return (vchunk << conf->chunk_shift) + offset;}
