md.c — from the Linux kernel source tree · C · 2,595 lines · part 1/5
/*
   md.c : Multiple Devices driver for Linux
          Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/freezer.h>
#include <linux/init.h>
#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))

#ifndef MODULE
static void autostart_arrays (int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_{min,max}
 * or /sys/block/mdX/md/sync_speed_{min,max}.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;

static inline int speed_min(mddev_t *mddev)
{
        return mddev->sync_speed_min ?
                mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(mddev_t *mddev)
{
        return mddev->sync_speed_max ?
                mddev->sync_speed_max : sysctl_speed_limit_max;
}
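/*
 * Illustrative sketch (not part of this file's code): md_do_sync(),
 * later in md.c, throttles reconstruction against these limits roughly
 * as below; 'currspeed' and the 'repeat' label are locals of that
 * function, shown here only for context.
 */
#if 0
        if (currspeed > speed_min(mddev)) {
                if ((currspeed > speed_max(mddev)) ||
                    !is_mddev_idle(mddev)) {
                        msleep(500);    /* back off, let application IO through */
                        goto repeat;
                }
        }
#endif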
static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MIN,
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = DEV_RAID_SPEED_LIMIT_MAX,
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
        {
                .ctl_name       = DEV_RAID,
                .procname       = "raid",
                .maxlen         = 0,
                .mode           = S_IRUGO|S_IXUGO,
                .child          = raid_table,
        },
        { .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
        {
                .ctl_name       = CTL_DEV,
                .procname       = "dev",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_dir_table,
        },
        { .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
        sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}
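/*
 * Usage sketch (userspace, hypothetical helper): a monitor such as
 * mdadm can sleep until the event count moves by poll()ing
 * /proc/mdstat; mdstat_poll(), later in this file, reports
 * POLLERR|POLLPRI once md_event_count differs from the count seen at
 * the last read.
 */
#if 0
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

static int wait_for_md_event(void)
{
        char buf[4096];
        struct pollfd pfd;

        pfd.fd = open("/proc/mdstat", O_RDONLY);
        if (pfd.fd < 0)
                return -1;
        /* read once so the seq_file records the current event count */
        read(pfd.fd, buf, sizeof(buf));
        pfd.events = POLLPRI;
        poll(&pfd, 1, -1);      /* returns when md_new_event() fires */
        close(pfd.fd);
        return 0;
}
#endif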
/*
 * Enables iteration over all existing md arrays;
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)                                        \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                tmp = all_mddevs.next;                                  \
                mddev = NULL;});                                        \
             ({ if (tmp != &all_mddevs)                                 \
                        mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (mddev) mddev_put(mddev);                            \
                mddev = list_entry(tmp, mddev_t, all_mddevs);           \
                tmp != &all_mddevs;});                                  \
             ({ spin_lock(&all_mddevs_lock);                            \
                tmp = tmp->next;})                                      \
                )

static int md_fail_request (struct request_queue *q, struct bio *bio)
{
        bio_io_error(bio);
        return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
        atomic_inc(&mddev->active);
        return mddev;
}

static void mddev_put(mddev_t *mddev)
{
        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
                return;
        if (!mddev->raid_disks && list_empty(&mddev->disks)) {
                list_del(&mddev->all_mddevs);
                spin_unlock(&all_mddevs_lock);
                blk_cleanup_queue(mddev->queue);
                kobject_unregister(&mddev->kobj);
        } else
                spin_unlock(&all_mddevs_lock);
}

static mddev_t * mddev_find(dev_t unit)
{
        mddev_t *mddev, *new = NULL;

 retry:
        spin_lock(&all_mddevs_lock);
        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                if (mddev->unit == unit) {
                        mddev_get(mddev);
                        spin_unlock(&all_mddevs_lock);
                        kfree(new);
                        return mddev;
                }

        if (new) {
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        mutex_init(&new->reconfig_mutex);
        INIT_LIST_HEAD(&new->disks);
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        new->reshape_position = MaxSector;

        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
                kfree(new);
                return NULL;
        }
        set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);

        blk_queue_make_request(new->queue, md_fail_request);

        goto retry;
}

static inline int mddev_lock(mddev_t * mddev)
{
        return mutex_lock_interruptible(&mddev->reconfig_mutex);
}

static inline int mddev_trylock(mddev_t * mddev)
{
        return mutex_trylock(&mddev->reconfig_mutex);
}

static inline void mddev_unlock(mddev_t * mddev)
{
        mutex_unlock(&mddev->reconfig_mutex);

        md_wakeup_thread(mddev->thread);
}

static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
        mdk_rdev_t * rdev;
        struct list_head *tmp;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->desc_nr == nr)
                        return rdev;
        }
        return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
        struct list_head *tmp;
        mdk_rdev_t *rdev;

        ITERATE_RDEV(mddev,rdev,tmp) {
                if (rdev->bdev->bd_dev == dev)
                        return rdev;
        }
        return NULL;
}

static struct mdk_personality *find_pers(int level, char *clevel)
{
        struct mdk_personality *pers;
        list_for_each_entry(pers, &pers_list, list) {
                if (level != LEVEL_NONE && pers->level == level)
                        return pers;
                if (strcmp(pers->name, clevel)==0)
                        return pers;
        }
        return NULL;
}
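/*
 * Usage sketch: personalities register themselves with the md core so
 * that find_pers() can resolve them by level or name; raid0.c, for
 * example, does essentially the following (register_md_personality()
 * appears later in this file).
 */
#if 0
static struct mdk_personality raid0_personality = {
        .name   = "raid0",
        .level  = 0,
        .owner  = THIS_MODULE,
        /* .make_request, .run, .stop, .status, ... */
};

static int __init raid0_init(void)
{
        return register_md_personality(&raid0_personality);
}
#endif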
static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
        sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
        return MD_NEW_SIZE_BLOCKS(size);
}

static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
{
        sector_t size;

        size = rdev->sb_offset;

        if (chunk_size)
                size &= ~((sector_t)chunk_size/1024 - 1);
        return size;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page)
                MD_BUG();

        rdev->sb_page = alloc_page(GFP_KERNEL);
        if (!rdev->sb_page) {
                printk(KERN_ALERT "md: out of memory.\n");
                return -EINVAL;
        }

        return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
        if (rdev->sb_page) {
                put_page(rdev->sb_page);
                rdev->sb_loaded = 0;
                rdev->sb_page = NULL;
                rdev->sb_offset = 0;
                rdev->size = 0;
        }
}

static void super_written(struct bio *bio, int error)
{
        mdk_rdev_t *rdev = bio->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                printk("md: super_written gets error=%d, uptodate=%d\n",
                       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
                WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
                md_error(mddev, rdev);
        }

        if (atomic_dec_and_test(&mddev->pending_writes))
                wake_up(&mddev->sb_wait);
        bio_put(bio);
}

static void super_written_barrier(struct bio *bio, int error)
{
        struct bio *bio2 = bio->bi_private;
        mdk_rdev_t *rdev = bio2->bi_private;
        mddev_t *mddev = rdev->mddev;

        if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
            error == -EOPNOTSUPP) {
                unsigned long flags;
                /* barriers don't appear to be supported :-( */
                set_bit(BarriersNotsupp, &rdev->flags);
                mddev->barriers_work = 0;
                spin_lock_irqsave(&mddev->write_lock, flags);
                bio2->bi_next = mddev->biolist;
                mddev->biolist = bio2;
                spin_unlock_irqrestore(&mddev->write_lock, flags);
                wake_up(&mddev->sb_wait);
                bio_put(bio);
        } else {
                bio_put(bio2);
                bio->bi_private = rdev;
                super_written(bio, error);
        }
}

void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
                   sector_t sector, int size, struct page *page)
{
        /* write first size bytes of page to sector of rdev
         * Increment mddev->pending_writes before returning
         * and decrement it on completion, waking up sb_wait
         * if zero is reached.
         * If an error occurred, call md_error
         *
         * As we might need to resubmit the request if BIO_RW_BARRIER
         * causes ENOTSUPP, we allocate a spare bio...
         */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);

        bio->bi_bdev = rdev->bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        bio->bi_private = rdev;
        bio->bi_end_io = super_written;
        bio->bi_rw = rw;

        atomic_inc(&mddev->pending_writes);
        if (!test_bit(BarriersNotsupp, &rdev->flags)) {
                struct bio *rbio;
                rw |= (1<<BIO_RW_BARRIER);
                rbio = bio_clone(bio, GFP_NOIO);
                rbio->bi_private = bio;
                rbio->bi_end_io = super_written_barrier;
                submit_bio(rw, rbio);
        } else
                submit_bio(rw, bio);
}

void md_super_wait(mddev_t *mddev)
{
        /* wait for all superblock writes that were scheduled to complete.
         * if any had to be retried (due to BARRIER problems), retry them
         */
        DEFINE_WAIT(wq);
        for(;;) {
                prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&mddev->pending_writes)==0)
                        break;
                while (mddev->biolist) {
                        struct bio *bio;
                        spin_lock_irq(&mddev->write_lock);
                        bio = mddev->biolist;
                        mddev->biolist = bio->bi_next;
                        bio->bi_next = NULL;
                        spin_unlock_irq(&mddev->write_lock);
                        submit_bio(bio->bi_rw, bio);
                }
                schedule();
        }
        finish_wait(&mddev->sb_wait, &wq);
}

static void bi_complete(struct bio *bio, int error)
{
        complete((struct completion*)bio->bi_private);
}

int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                   struct page *page, int rw)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);
        struct completion event;
        int ret;

        rw |= (1 << BIO_RW_SYNC);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio_add_page(bio, page, size, 0);
        init_completion(&event);
        bio->bi_private = &event;
        bio->bi_end_io = bi_complete;
        submit_bio(rw, bio);
        wait_for_completion(&event);

        ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
        bio_put(bio);
        return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(mdk_rdev_t * rdev, int size)
{
        char b[BDEVNAME_SIZE];
        if (!rdev->sb_page) {