raid1.c

来自「Linux Kernel 2.6.9 for OMAP1710」· C语言 代码 · 共 1,426 行 · 第 1/3 页

C
1,426
字号
/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/raid/raid1.h>

/*
 * Number of guaranteed r1bios in case of extreme VM load:
 */
#define	NR_RAID1_BIOS 256

static mdk_personality_t raid1_personality;

static spinlock_t retry_list_lock = SPIN_LOCK_UNLOCKED;
static LIST_HEAD(retry_list_head);

static void unplug_slaves(mddev_t *mddev);

/*
 * Mempool allocator for r1bio_t. On allocation failure, kick the
 * member devices' queues in the hope of freeing up memory.
 */
static void * r1bio_pool_alloc(int gfp_flags, void *data)
{
	struct pool_info *pi = data;
	r1bio_t *r1_bio;
	int size = offsetof(r1bio_t, bios[pi->raid_disks]);

	/* allocate a r1bio with room for raid_disks entries in the bios array */
	r1_bio = kmalloc(size, gfp_flags);
	if (r1_bio)
		memset(r1_bio, 0, size);
	else
		unplug_slaves(pi->mddev);

	return r1_bio;
}

static void r1bio_pool_free(void *r1_bio, void *data)
{
	kfree(r1_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)

/*
 * Allocate an r1bio plus the bios and data pages used during resync.
 * One bio per raid disk; the RESYNC_PAGES data pages are attached to
 * the first bio only.
 */
static void * r1buf_pool_alloc(int gfp_flags, void *data)
{
	struct pool_info *pi = data;
	struct page *page;
	r1bio_t *r1_bio;
	struct bio *bio;
	int i, j;

	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
	if (!r1_bio) {
		unplug_slaves(pi->mddev);
		return NULL;
	}

	/*
	 * Allocate bios : 1 for reading, n-1 for writing
	 */
	for (j = pi->raid_disks ; j-- ; ) {
		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r1_bio->bios[j] = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them to
	 * the first bio;
	 */
	bio = r1_bio->bios[0];
	for (i = 0; i < RESYNC_PAGES; i++) {
		page = alloc_page(gfp_flags);
		if (unlikely(!page))
			goto out_free_pages;

		bio->bi_io_vec[i].bv_page = page;
	}

	r1_bio->master_bio = NULL;

	return r1_bio;

out_free_pages:
	/* i pages were allocated before the failure */
	for ( ; i > 0 ; i--)
		__free_page(bio->bi_io_vec[i-1].bv_page);
out_free_bio:
	/* bios [j+1 .. raid_disks-1] were allocated before the failure */
	while ( ++j < pi->raid_disks )
		bio_put(r1_bio->bios[j]);
	r1bio_pool_free(r1_bio, data);
	return NULL;
}

/* Free an r1bio allocated by r1buf_pool_alloc, including pages and bios. */
static void r1buf_pool_free(void *__r1_bio, void *data)
{
	struct pool_info *pi = data;
	int i;
	r1bio_t *r1bio = __r1_bio;
	struct bio *bio = r1bio->bios[0];

	/* data pages live on the first bio only */
	for (i = 0; i < RESYNC_PAGES; i++) {
		__free_page(bio->bi_io_vec[i].bv_page);
		bio->bi_io_vec[i].bv_page = NULL;
	}
	for (i=0 ; i < pi->raid_disks; i++)
		bio_put(r1bio->bios[i]);

	r1bio_pool_free(r1bio, data);
}

/* Drop and clear every per-mirror bio reference held by an r1bio. */
static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
{
	int i;

	for (i = 0; i < conf->raid_disks; i++) {
		struct bio **bio = r1_bio->bios + i;
		if (*bio)
			bio_put(*bio);
		*bio = NULL;
	}
}

/* Release an r1bio back to the pool, waking resync waiters if we go idle. */
static inline void free_r1bio(r1bio_t *r1_bio)
{
	unsigned long flags;

	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);

	put_all_bios(conf, r1_bio);
	mempool_free(r1_bio, conf->r1bio_pool);
}

/* Release a resync buffer r1bio and lower the resync barrier. */
static inline void put_buf(r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);
	unsigned long flags;

	mempool_free(r1_bio, conf->r1buf_pool);

	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!conf->barrier)
		BUG();
	--conf->barrier;
	wake_up(&conf->wait_resume);
	wake_up(&conf->wait_idle);

	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);
}

/* Queue an r1bio for the raid1d thread to retry, and wake that thread. */
static void reschedule_retry(r1bio_t *r1_bio)
{
	unsigned long flags;
	mddev_t *mddev = r1_bio->mddev;

	spin_lock_irqsave(&retry_list_lock, flags);
	list_add(&r1_bio->retry_list, &retry_list_head);
	spin_unlock_irqrestore(&retry_list_lock, flags);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(r1bio_t *r1_bio)
{
	struct bio *bio = r1_bio->master_bio;

	bio_endio(bio, bio->bi_size,
		test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
	free_r1bio(r1_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int disk, r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	conf->mirrors[disk].head_position =
		r1_bio->sector + (r1_bio->sectors);
}

/* bio completion callback for the read side of a mirrored request. */
static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror;
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	/* partial completion: wait for the rest of the bio */
	if (bio->bi_size)
		return 1;

	mirror = r1_bio->read_disk;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
	else
		/*
		 * Set R1BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R1BIO_Uptodate, &r1_bio->state);

	update_head_pos(mirror, r1_bio);

	/*
	 * we have only one bio on the read side
	 */
	if (uptodate)
		raid_end_bio_io(r1_bio);
	else {
		/*
		 * oops, read error: retry via raid1d on another mirror
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
			       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
		reschedule_retry(r1_bio);
	}

	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
	return 0;
}

/* bio completion callback for one leg of a mirrored write. */
static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror;
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	/* partial completion: wait for the rest of the bio */
	if (bio->bi_size)
		return 1;

	/* find which mirror this bio belongs to */
	for (mirror = 0; mirror < conf->raid_disks; mirror++)
		if (r1_bio->bios[mirror] == bio)
			break;

	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
	else
		/*
		 * Set R1BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R1BIO_Uptodate, &r1_bio->state);

	update_head_pos(mirror, r1_bio);

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	if (atomic_dec_and_test(&r1_bio->remaining)) {
		md_write_end(r1_bio->mddev);
		raid_end_bio_io(r1_bio);
	}

	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
	return 0;
}

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
	const unsigned long this_sector = r1_bio->sector;
	int new_disk = conf->last_used, disk = new_disk;
	const int sectors = r1_bio->sectors;
	sector_t new_distance, current_distance;

	spin_lock_irq(&conf->device_lock);

	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on, or below the resync window.
	 * We take the first readable disk when above the resync window.
	 */
	if (conf->mddev->recovery_cp < MaxSector &&
	    (this_sector + sectors >= conf->next_resync)) {
		/* Choose the first operational device, for consistency */
		new_disk = 0;

		while (!conf->mirrors[new_disk].rdev ||
		       !conf->mirrors[new_disk].rdev->in_sync) {
			new_disk++;
			if (new_disk == conf->raid_disks) {
				new_disk = -1;
				break;
			}
		}
		goto rb_out;
	}

	/* make sure the disk is operational */
	while (!conf->mirrors[new_disk].rdev ||
	       !conf->mirrors[new_disk].rdev->in_sync) {
		if (new_disk <= 0)
			new_disk = conf->raid_disks;
		new_disk--;
		if (new_disk == disk) {
			new_disk = -1;
			goto rb_out;
		}
	}
	disk = new_disk;
	/* now disk == new_disk == starting point for search */

	/*
	 * Don't change to another disk for sequential reads:
	 */
	if (conf->next_seq_sect == this_sector)
		goto rb_out;
	if (this_sector == conf->mirrors[new_disk].head_position)
		goto rb_out;

	current_distance = abs(this_sector - conf->mirrors[disk].head_position);

	/* Find the disk whose head is closest */
	do {
		if (disk <= 0)
			disk = conf->raid_disks;
		disk--;

		if (!conf->mirrors[disk].rdev ||
		    !conf->mirrors[disk].rdev->in_sync)
			continue;

		/* an idle disk wins outright */
		if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) {
			new_disk = disk;
			break;
		}
		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
		if (new_distance < current_distance) {
			current_distance = new_distance;
			new_disk = disk;
		}
	} while (disk != conf->last_used);

rb_out:
	if (new_disk >= 0) {
		conf->next_seq_sect = this_sector + sectors;
		conf->last_used = new_disk;
		atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending);
	}
	spin_unlock_irq(&conf->device_lock);

	return new_disk;
}

/*
 * Kick the request queue of every busy member device. The per-rdev
 * nr_pending reference is taken so the rdev cannot vanish while the
 * device_lock is dropped around the unplug callback.
 */
static void unplug_slaves(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			spin_unlock_irqrestore(&conf->device_lock, flags);

			if (r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);

			spin_lock_irqsave(&conf->device_lock, flags);
			atomic_dec(&rdev->nr_pending);
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

/* queue unplug callback: unplug all member devices */
static void raid1_unplug(request_queue_t *q)
{
	unplug_slaves(q->queuedata);
}

/* Issue a cache flush on every non-faulty member device. */
static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;
	int i, ret = 0;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue = bdev_get_queue(bdev);

			if (r_queue->issue_flush_fn) {
				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
				if (ret)
					break;
			}
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
	/* NOTE(review): raid1_issue_flush is truncated at this chunk
	 * boundary (page 1/3 of the source); its trailing return and
	 * closing brace continue in the next chunk. */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?