md.c

From the Linux kernel source tree · C code · 2,595 lines total · page 1 of 5

C
2,595
字号
/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/freezer.h>
#include <linux/init.h>
#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))

#ifndef MODULE
static void autostart_arrays (int part);
#endif

/* List of registered md personalities (raid levels), and its lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;

/* Per-array minimum resync speed; falls back to the global sysctl if unset. */
static inline int speed_min(mddev_t *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

/* Per-array maximum resync speed; falls back to the global sysctl if unset. */
static inline int speed_max(mddev_t *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

/* /proc/sys/dev/raid/{speed_limit_min,speed_limit_max} */
static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
	sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Enables to iterate over all existing md arrays
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * Iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock); 				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)

/* Default make_request_fn: fail every bio until a personality takes over. */
static int md_fail_request (struct request_queue *q, struct bio *bio)
{
	bio_io_error(bio);
	return 0;
}

/* Take a reference on an mddev. */
static inline mddev_t *mddev_get(mddev_t *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

/* Drop a reference; free the mddev if it is now unused and diskless. */
static void mddev_put(mddev_t *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
		list_del(&mddev->all_mddevs);
		spin_unlock(&all_mddevs_lock);
		blk_cleanup_queue(mddev->queue);
		kobject_unregister(&mddev->kobj);
	} else
		spin_unlock(&all_mddevs_lock);
}

/*
 * Find the mddev for device number 'unit', allocating and registering a
 * new one (with its request queue) if none exists yet.  Returns a
 * referenced mddev, or NULL on allocation failure.  The allocate-then-
 * retry loop avoids holding all_mddevs_lock across the allocations.
 */
static mddev_t * mddev_find(dev_t unit)
{
	mddev_t *mddev, *new = NULL;

 retry:
	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit) {
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			/* someone else beat us to it; discard our spare */
			kfree(new);
			return mddev;
		}

	if (new) {
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		/* mdp devices carve 63 partitions out of each minor */
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mutex_init(&new->reconfig_mutex);
	INIT_LIST_HEAD(&new->disks);
	INIT_LIST_HEAD(&new->all_mddevs);
	init_timer(&new->safemode_timer);
	atomic_set(&new->active, 1);
	spin_lock_init(&new->write_lock);
	init_waitqueue_head(&new->sb_wait);
	new->reshape_position = MaxSector;

	new->queue = blk_alloc_queue(GFP_KERNEL);
	if (!new->queue) {
		kfree(new);
		return NULL;
	}
	set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);

	blk_queue_make_request(new->queue, md_fail_request);

	goto retry;
}

static inline int mddev_lock(mddev_t * mddev)
{
	return mutex_lock_interruptible(&mddev->reconfig_mutex);
}

static inline int mddev_trylock(mddev_t * mddev)
{
	return mutex_trylock(&mddev->reconfig_mutex);
}

static inline void mddev_unlock(mddev_t * mddev)
{
	mutex_unlock(&mddev->reconfig_mutex);

	/* the md thread may have work queued up behind the lock */
	md_wakeup_thread(mddev->thread);
}

/* Find the member device with descriptor number 'nr', or NULL. */
static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
	mdk_rdev_t * rdev;
	struct list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == nr)
			return rdev;
	}
	return NULL;
}

/* Find the member device backed by block device 'dev', or NULL. */
static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->bdev->bd_dev == dev)
			return rdev;
	}
	return NULL;
}

/* Look up a registered personality by numeric level or by name. */
static struct mdk_personality *find_pers(int level, char *clevel)
{
	struct mdk_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* Location (in 1K blocks) of the 0.90 superblock near the end of the device. */
static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
	sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
	return MD_NEW_SIZE_BLOCKS(size);
}

/* Usable size of a member device, rounded down to a whole chunk. */
static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
	sector_t size;

	size = rdev->sb_offset;

	if (chunk_size)
		size &= ~((sector_t)chunk_size/1024 - 1);
	return size;
}

/* Allocate the page used to hold this rdev's on-disk superblock. */
static int alloc_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page)
		MD_BUG();

	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(KERN_ALERT "md: out of memory.\n");
		return -EINVAL;
	}

	return 0;
}

/* Release the superblock page and clear the rdev's superblock state. */
static void free_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_offset = 0;
		rdev->size = 0;
	}
}


/* Completion handler for a superblock write; reports errors via md_error. */
static void super_written(struct bio *bio, int error)
{
	mdk_rdev_t *rdev = bio->bi_private;
	mddev_t *mddev = rdev->mddev;

	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		printk("md: super_written gets error=%d, uptodate=%d\n",
		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
		md_error(mddev, rdev);
	}

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	bio_put(bio);
}

/*
 * Completion handler for the barrier variant of a superblock write.
 * If the device rejected the barrier (-EOPNOTSUPP), queue the spare
 * non-barrier bio on mddev->biolist for md_super_wait to resubmit;
 * otherwise fall through to the normal super_written path.
 */
static void super_written_barrier(struct bio *bio, int error)
{
	struct bio *bio2 = bio->bi_private;
	mdk_rdev_t *rdev = bio2->bi_private;
	mddev_t *mddev = rdev->mddev;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
	    error == -EOPNOTSUPP) {
		unsigned long flags;
		/* barriers don't appear to be supported :-( */
		set_bit(BarriersNotsupp, &rdev->flags);
		mddev->barriers_work = 0;
		spin_lock_irqsave(&mddev->write_lock, flags);
		bio2->bi_next = mddev->biolist;
		mddev->biolist = bio2;
		spin_unlock_irqrestore(&mddev->write_lock, flags);
		wake_up(&mddev->sb_wait);
		bio_put(bio);
	} else {
		bio_put(bio2);
		bio->bi_private = rdev;
		super_written(bio, error);
	}
}

void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
		   sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 *
	 * As we might need to resubmit the request if BIO_RW_BARRIER
	 * causes ENOTSUPP, we allocate a spare bio...
	 */
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);

	bio->bi_bdev = rdev->bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;
	bio->bi_rw = rw;

	atomic_inc(&mddev->pending_writes);
	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
		struct bio *rbio;
		rw |= (1<<BIO_RW_BARRIER);
		rbio = bio_clone(bio, GFP_NOIO);
		/* the clone carries the barrier; 'bio' is the spare kept
		 * for resubmission if the barrier is rejected */
		rbio->bi_private = bio;
		rbio->bi_end_io = super_written_barrier;
		submit_bio(rw, rbio);
	} else
		submit_bio(rw, bio);
}

void md_super_wait(mddev_t *mddev)
{
	/* wait for all superblock writes that were scheduled to complete.
	 * if any had to be retried (due to BARRIER problems), retry them
	 */
	DEFINE_WAIT(wq);
	for(;;) {
		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&mddev->pending_writes)==0)
			break;
		/* resubmit (without barrier) any writes parked on biolist */
		while (mddev->biolist) {
			struct bio *bio;
			spin_lock_irq(&mddev->write_lock);
			bio = mddev->biolist;
			mddev->biolist = bio->bi_next ;
			bio->bi_next = NULL;
			spin_unlock_irq(&mddev->write_lock);
			submit_bio(bio->bi_rw, bio);
		}
		schedule();
	}
	finish_wait(&mddev->sb_wait, &wq);
}

/* bi_end_io helper: signal the completion embedded by sync_page_io. */
static void bi_complete(struct bio *bio, int error)
{
	complete((struct completion*)bio->bi_private);
}

/*
 * Synchronously read or write 'size' bytes of 'page' at 'sector' of
 * 'bdev'.  Returns non-zero on success (BIO_UPTODATE set), 0 on failure.
 */
int sync_page_io(struct block_device *bdev, sector_t sector, int size,
		   struct page *page, int rw)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	struct completion event;
	int ret;

	rw |= (1 << BIO_RW_SYNC);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	init_completion(&event);
	bio->bi_private = &event;
	bio->bi_end_io = bi_complete;
	submit_bio(rw, bio);
	wait_for_completion(&event);

	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

/* NOTE(review): function continues beyond this excerpt (page 1 of 5). */
static int read_disk_sb(mdk_rdev_t * rdev, int size)
{
	char b[BDEVNAME_SIZE];
	if (!rdev->sb_page) {

⌨️ Keyboard shortcuts

Copy code: Ctrl + C
Search code: Ctrl + F
Full-screen mode: F11
Increase font size: Ctrl + =
Decrease font size: Ctrl + -
Show shortcuts: ?