/*
 * raid1.c — excerpt (page 1 of 4, ~2,200 lines total) of the Linux kernel
 * MD RAID-1 driver, captured from a web-based code viewer.  The viewer's
 * navigation chrome has been replaced by this note.
 */
/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
 * bitmapped intelligence in resync:
 *
 *	- bitmap marked during normal i/o
 *	- bitmap used to skip nondirty blocks during sync
 *
 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
 * - persistent bitmap code
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "dm-bio-list.h"
#include <linux/raid/raid1.h>
#include <linux/raid/bitmap.h>

/* Compile-time debug switch: PRINTK() expands to printk() only when DEBUG is set. */
#define DEBUG 0
#if DEBUG
#define PRINTK(x...) printk(x)
#else
#define PRINTK(x...)
#endif

/*
 * Number of guaranteed r1bios in case of extreme VM load:
 */
#define	NR_RAID1_BIOS 256

static void unplug_slaves(mddev_t *mddev);

static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);

/*
 * Mempool allocation callback for conf->r1bio_pool: allocate one zeroed
 * r1bio_t with room for pi->raid_disks bio pointers in its trailing
 * bios[] array.  On allocation failure the member devices are unplugged
 * so queued I/O can complete and release memory before the mempool
 * machinery retries.
 */
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct pool_info *pi = data;
	r1bio_t *r1_bio;
	int size = offsetof(r1bio_t, bios[pi->raid_disks]);

	/* allocate a r1bio with room for raid_disks entries in the bios array */
	r1_bio = kzalloc(size, gfp_flags);
	if (!r1_bio)
		unplug_slaves(pi->mddev);

	return r1_bio;
}

/* Mempool free callback matching r1bio_pool_alloc(). */
static void r1bio_pool_free(void *r1_bio, void *data)
{
	kfree(r1_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)

/*
 * Mempool allocation callback for conf->r1buf_pool (resync buffers):
 * allocate an r1bio plus one bio per raid disk, each with RESYNC_PAGES
 * page slots.  For a user-requested check/repair every bio gets its own
 * data pages (so the per-device copies can be compared); otherwise one
 * set of pages is allocated and shared by all the bios.
 */
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct pool_info *pi = data;
	struct page *page;
	r1bio_t *r1_bio;
	struct bio *bio;
	int i, j;

	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
	if (!r1_bio) {
		unplug_slaves(pi->mddev);
		return NULL;
	}

	/*
	 * Allocate bios : 1 for reading, n-1 for writing
	 */
	for (j = pi->raid_disks ; j-- ; ) {
		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r1_bio->bios[j] = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them to
	 * the first bio.
	 * If this is a user-requested check/repair, allocate
	 * RESYNC_PAGES for each bio.
	 */
	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
		j = pi->raid_disks;
	else
		j = 1;
	while(j--) {
		bio = r1_bio->bios[j];
		for (i = 0; i < RESYNC_PAGES; i++) {
			page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
		}
	}
	/* If not a user-requested check/repair, share bios[0]'s pages with all bios */
	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
		for (i=0; i<RESYNC_PAGES ; i++)
			for (j=1; j<pi->raid_disks; j++)
				r1_bio->bios[j]->bi_io_vec[i].bv_page =
					r1_bio->bios[0]->bi_io_vec[i].bv_page;
	}

	r1_bio->master_bio = NULL;

	return r1_bio;

out_free_pages:
	/* NOTE(review): this sweeps every bio's page slots; slots never
	 * allocated are presumably still NULL from bio_alloc() and tolerated
	 * by safe_put_page() — confirm against the bio allocator. */
	for (i=0; i < RESYNC_PAGES ; i++)
		for (j=0 ; j < pi->raid_disks; j++)
			safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
	j = -1;
out_free_bio:
	while ( ++j < pi->raid_disks )
		bio_put(r1_bio->bios[j]);
	r1bio_pool_free(r1_bio, data);
	return NULL;
}

/*
 * Mempool free callback for conf->r1buf_pool.  A page that is shared
 * (bv_page equal to the one in bios[0]) is released only once, through
 * bios[0]; privately owned pages are released per bio.
 */
static void r1buf_pool_free(void *__r1_bio, void *data)
{
	struct pool_info *pi = data;
	int i,j;
	r1bio_t *r1bio = __r1_bio;

	for (i = 0; i < RESYNC_PAGES; i++)
		for (j = pi->raid_disks; j-- ;) {
			if (j == 0 ||
			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
			    r1bio->bios[0]->bi_io_vec[i].bv_page)
				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
		}
	for (i=0 ; i < pi->raid_disks; i++)
		bio_put(r1bio->bios[i]);

	r1bio_pool_free(r1bio, data);
}

/*
 * Drop every per-mirror bio reference held by an r1bio and clear the
 * slots.  IO_BLOCKED is a placeholder value, not a real bio, so it is
 * only cleared, never bio_put().
 */
static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
{
	int i;

	for (i = 0; i < conf->raid_disks; i++) {
		struct bio **bio = r1_bio->bios + i;
		if (*bio && *bio != IO_BLOCKED)
			bio_put(*bio);
		*bio = NULL;
	}
}

/* Return a normal-I/O r1bio to conf->r1bio_pool, releasing its barrier hold. */
static void free_r1bio(r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	allow_barrier(conf);

	put_all_bios(conf, r1_bio);
	mempool_free(r1_bio, conf->r1bio_pool);
}

/*
 * Return a resync r1bio to conf->r1buf_pool.  Drops the rdev reference
 * for each mirror whose bio was actually submitted (bi_end_io set), then
 * lowers the resync barrier so normal I/O may proceed.
 */
static void put_buf(r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);
	int i;

	for (i=0; i<conf->raid_disks; i++) {
		struct bio *bio = r1_bio->bios[i];
		if (bio->bi_end_io)
			rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
	}

	mempool_free(r1_bio, conf->r1buf_pool);

	lower_barrier(conf);
}

/*
 * Queue an r1bio on conf->retry_list (under device_lock) for the raid1
 * daemon to handle, then wake both barrier waiters and the md thread.
 */
static void reschedule_retry(r1bio_t *r1_bio)
{
	unsigned long flags;
	mddev_t *mddev = r1_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r1_bio->retry_list, &conf->retry_list);
	conf->nr_queued ++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	wake_up(&conf->wait_barrier);
	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(r1bio_t *r1_bio)
{
	struct bio *bio = r1_bio->master_bio;

	/* if nobody has done the final endio yet, do it now */
	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
		PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
			(bio_data_dir(bio) == WRITE) ? "write" : "read",
			(unsigned long long) bio->bi_sector,
			(unsigned long long) bio->bi_sector +
				(bio->bi_size >> 9) - 1);

		bio_endio(bio,
			test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
	}
	free_r1bio(r1_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int disk, r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	conf->mirrors[disk].head_position =
		r1_bio->sector + (r1_bio->sectors);
}

/*
 * bi_end_io handler for a normal read: on success mark the r1bio
 * up-to-date and complete it; on failure either complete with the error
 * (when no other mirror could serve the read) or queue the r1bio for a
 * retry on another mirror via reschedule_retry().
 */
static void raid1_end_read_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror;
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	mirror = r1_bio->read_disk;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(mirror, r1_bio);

	if (uptodate)
		set_bit(R1BIO_Uptodate, &r1_bio->state);
	else {
		/* If all other devices have failed, we want to return
		 * the error upwards rather than fail the last device.
		 * Here we redefine "uptodate" to mean "Don't want to retry"
		 */
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		if (r1_bio->mddev->degraded == conf->raid_disks ||
		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
		     !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
			uptodate = 1;
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}

	if (uptodate)
		raid_end_bio_io(r1_bio);
	else {
		/*
		 * oops, read error:
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
			       bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
		reschedule_retry(r1_bio);
	}

	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}

/*
 * bi_end_io handler for mirrored writes: record one mirror's completion,
 * handle barrier-unsupported retries and write-behind acknowledgement,
 * and finish the request once the last mirror has completed.
 */
static void raid1_end_write_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
	conf_t *conf = mddev_to_conf(r1_bio->mddev);
	struct bio *to_put = NULL;

	/* find which mirror this completion belongs to */
	for (mirror = 0; mirror < conf->raid_disks; mirror++)
		if (r1_bio->bios[mirror] == bio)
			break;
	if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
		/* The device rejected the barrier: remember that and arrange
		 * for the whole write to be retried (without barriers) once
		 * all outstanding copies have come back. */
		set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
		set_bit(R1BIO_BarrierRetry, &r1_bio->state);
		r1_bio->mddev->barriers_work = 0;
		/* Don't rdev_dec_pending in this branch - keep it for the retry */
	} else {
		/*
		 * this branch is our 'one mirror IO has finished' event handler:
		 */
		r1_bio->bios[mirror] = NULL;
		to_put = bio;
		if (!uptodate) {
			md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
			/* an I/O failed, we can't clear the bitmap */
			set_bit(R1BIO_Degraded, &r1_bio->state);
		} else
			/*
			 * Set R1BIO_Uptodate in our master bio, so that
			 * we will return a good error code for to the higher
			 * levels even if IO on some other mirrored buffer fails.
			 *
			 * The 'master' represents the composite IO operation to
			 * user-side. So if something waits for IO, then it will
			 * wait for the 'master' bio.
			 */
			set_bit(R1BIO_Uptodate, &r1_bio->state);

		update_head_pos(mirror, r1_bio);

		if (behind) {
			if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
				atomic_dec(&r1_bio->behind_remaining);

			/* In behind mode, we ACK the master bio once the I/O has safely
			 * reached all non-writemostly disks. Setting the Returned bit
			 * ensures that this gets done only once -- we don't ever want to
			 * return -EIO here, instead we'll wait */
			if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
			    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
				/* Maybe we can return now */
				if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
					struct bio *mbio = r1_bio->master_bio;
					PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
					       (unsigned long long) mbio->bi_sector,
					       (unsigned long long) mbio->bi_sector +
					       (mbio->bi_size >> 9) - 1);
					bio_endio(mbio, 0);
				}
			}
		}
		rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
	}
	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	if (atomic_dec_and_test(&r1_bio->remaining)) {
		if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
			reschedule_retry(r1_bio);
		else {
			/* it really is the end of this request */
			if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
				/* free extra copy of the data pages */
				int i = bio->bi_vcnt;
				while (i--)
					safe_put_page(bio->bi_io_vec[i].bv_page);
			}
			/* clear the bitmap if all writes complete successfully */
			bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
					r1_bio->sectors,
					!test_bit(R1BIO_Degraded, &r1_bio->state),
					behind);
			md_write_end(r1_bio->mddev);
			raid_end_bio_io(r1_bio);
		}
	}
	if (to_put)
		bio_put(to_put);
}

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last know head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
	const unsigned long this_sector = r1_bio->sector;
	int new_disk = conf->last_used, disk = new_disk;
	int wonly_disk = -1;
	const int sectors = r1_bio->sectors;
	sector_t new_distance, current_distance;
	mdk_rdev_t *rdev;

	rcu_read_lock();
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on, or below the resync window.
	 * We take the first readable disk when above the resync window.
	 */
 retry:
	if (conf->mddev->recovery_cp < MaxSector &&
	    (this_sector + sectors >= conf->next_resync)) {
		/* Choose the first operational device, for consistency */
		new_disk = 0;

		for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
		     r1_bio->bios[new_disk] == IO_BLOCKED ||
		     !rdev || !test_bit(In_sync, &rdev->flags)
			     || test_bit(WriteMostly, &rdev->flags);
		     rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {

			/* remember a usable write-mostly disk as a fallback */
			if (rdev && test_bit(In_sync, &rdev->flags) &&
				r1_bio->bios[new_disk] != IO_BLOCKED)
				wonly_disk = new_disk;

			if (new_disk == conf->raid_disks - 1) {
				new_disk = wonly_disk;
				break;
			}
		}
		goto rb_out;
	}


	/* make sure the disk is operational */
	for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
	     r1_bio->bios[new_disk] == IO_BLOCKED ||
	     !rdev || !test_bit(In_sync, &rdev->flags) ||
		     test_bit(WriteMostly, &rdev->flags);
	     rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {

		if (rdev && test_bit(In_sync, &rdev->flags) &&
		    r1_bio->bios[new_disk] != IO_BLOCKED)
			wonly_disk = new_disk;

		/* walk backwards (with wrap-around) from the last used disk */
		if (new_disk <= 0)
			new_disk = conf->raid_disks;
		new_disk--;
		if (new_disk == disk) {
			/* scanned every mirror; fall back to write-mostly (or -1) */
			new_disk = wonly_disk;
			break;
		}
	}

	if (new_disk < 0)
		goto rb_out;

	disk = new_disk;
	/* now disk == new_disk == starting point for search */

	/*
	 * Don't change to another disk for sequential reads:
	 */
	if (conf->next_seq_sect == this_sector)
		goto rb_out;
	if (this_sector == conf->mirrors[new_disk].head_position)
		goto rb_out;

	current_distance = abs(this_sector - conf->mirrors[disk].head_position);

	/* Find the disk whose head is closest */

	do {
		if (disk <= 0)
			disk = conf->raid_disks;
		disk--;

		rdev = rcu_dereference(conf->mirrors[disk].rdev);

		if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
		    !test_bit(In_sync, &rdev->flags) ||
		    test_bit(WriteMostly, &rdev->flags))
			continue;

		/* an idle disk wins outright */
		if (!atomic_read(&rdev->nr_pending)) {
			new_disk = disk;
			break;
		}
		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
		if (new_distance < current_distance) {
			current_distance = new_distance;
			new_disk = disk;
		}
	} while (disk != conf->last_used);

 rb_out:


	if (new_disk >= 0) {
		rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
		if (!rdev)
			goto retry;
		atomic_inc(&rdev->nr_pending);
		if (!test_bit(In_sync, &rdev->flags)) {
			/* cannot risk returning a device that failed
			 * before we inc'ed nr_pending
			 */
			rdev_dec_pending(rdev, conf->mddev);
			goto retry;
		}
		conf->next_seq_sect = this_sector + sectors;
		conf->last_used = new_disk;
	}
	rcu_read_unlock();

	return new_disk;
}

/*
 * NOTE(review): unplug_slaves() is truncated in this excerpt — the body
 * below ends mid-function at the page boundary (page 1 of 4) and
 * continues on the next page of the capture.
 */
static void unplug_slaves(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;

	rcu_read_lock();
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();

/*
 * End of captured excerpt (page 1 of 4).  The remainder of
 * unplug_slaves() and the rest of raid1.c continue on the following
 * pages of the original capture; the viewer's keyboard-shortcut help
 * panel that appeared here has been replaced by this note.
 */