/*
 * raid5.c — excerpt from the Linux kernel source.
 * C code, 2,325 lines total; this is part 1 of 5.
 */
/* * raid5.c : Multiple Devices driver for Linux * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman * Copyright (C) 1999, 2000 Ingo Molnar * Copyright (C) 2002, 2003 H. Peter Anvin * * RAID-4/5/6 management functions. * Thanks to Penguin Computing for making the RAID-6 development possible * by donating a test server! * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * You should have received a copy of the GNU General Public License * (for example /usr/src/linux/COPYING); if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * BITMAP UNPLUGGING: * * The sequencing for updating the bitmap reliably is a little * subtle (and I got it wrong the first time) so it deserves some * explanation. * * We group bitmap updates into batches. Each batch has a number. * We may write out several batches at once, but that isn't very important. * conf->bm_write is the number of the last batch successfully written. * conf->bm_flush is the number of the last batch that was closed to * new additions. * When we discover that we will need to write to any block in a stripe * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq * the number of the batch it will be in. This is bm_flush+1. * When we are ready to do a write, if that batch hasn't been written yet, * we plug the array and queue the stripe for later. * When an unplug happens, we increment bm_flush, thus closing the current * batch. * When we notice that bm_flush > bm_write, we write out all pending updates * to the bitmap, and advance bm_write to where bm_flush was. * This may occasionally write a bit out twice, but is sure never to * miss any bits. 
*/#include <linux/module.h>#include <linux/slab.h>#include <linux/highmem.h>#include <linux/bitops.h>#include <linux/kthread.h>#include <asm/atomic.h>#include "raid6.h"#include <linux/raid/bitmap.h>#include <linux/async_tx.h>/* * Stripe cache */#define NR_STRIPES 256#define STRIPE_SIZE PAGE_SIZE#define STRIPE_SHIFT (PAGE_SHIFT - 9)#define STRIPE_SECTORS (STRIPE_SIZE>>9)#define IO_THRESHOLD 1#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))#define HASH_MASK (NR_HASH - 1)#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))/* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and * a bio could span several devices. * When walking this list for a particular stripe+device, we must never proceed * beyond a bio that extends past this device, as the next bio might no longer * be valid. * This macro is used to determine the 'next' bio in the list, given the sector * of the current stripe+device */#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)/* * The following can be used to debug the driver */#define RAID5_PARANOIA 1#if RAID5_PARANOIA && defined(CONFIG_SMP)# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)#else# define CHECK_DEVLOCK()#endif#ifdef DEBUG#define inline#define __inline__#endif#if !RAID6_USE_EMPTY_ZERO_PAGE/* In .bss so it's zeroed */const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));#endifstatic inline int raid6_next_disk(int disk, int raid_disks){ disk++; return (disk < raid_disks) ? disk : 0;}static void return_io(struct bio *return_bi){ struct bio *bi = return_bi; while (bi) { return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; bi->bi_end_io(bi, test_bit(BIO_UPTODATE, &bi->bi_flags) ? 
0 : -EIO); bi = return_bi; }}static void print_raid5_conf (raid5_conf_t *conf);static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh){ if (atomic_dec_and_test(&sh->count)) { BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state)) { list_add_tail(&sh->lru, &conf->delayed_list); blk_plug_device(conf->mddev->queue); } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) { list_add_tail(&sh->lru, &conf->bitmap_list); blk_plug_device(conf->mddev->queue); } else { clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); } md_wakeup_thread(conf->mddev->thread); } else { BUG_ON(sh->ops.pending); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); } atomic_dec(&conf->active_stripes); if (!test_bit(STRIPE_EXPANDING, &sh->state)) { list_add_tail(&sh->lru, &conf->inactive_list); wake_up(&conf->wait_for_stripe); if (conf->retry_read_aligned) md_wakeup_thread(conf->mddev->thread); } } }}static void release_stripe(struct stripe_head *sh){ raid5_conf_t *conf = sh->raid_conf; unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); __release_stripe(conf, sh); spin_unlock_irqrestore(&conf->device_lock, flags);}static inline void remove_hash(struct stripe_head *sh){ pr_debug("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); hlist_del_init(&sh->hash);}static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh){ struct hlist_head *hp = stripe_hash(conf, sh->sector); pr_debug("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); CHECK_DEVLOCK(); hlist_add_head(&sh->hash, hp);}/* find an idle stripe, make sure it is unhashed, and return it. 
*/static struct stripe_head *get_free_stripe(raid5_conf_t *conf){ struct stripe_head *sh = NULL; struct list_head *first; CHECK_DEVLOCK(); if (list_empty(&conf->inactive_list)) goto out; first = conf->inactive_list.next; sh = list_entry(first, struct stripe_head, lru); list_del_init(first); remove_hash(sh); atomic_inc(&conf->active_stripes);out: return sh;}static void shrink_buffers(struct stripe_head *sh, int num){ struct page *p; int i; for (i=0; i<num ; i++) { p = sh->dev[i].page; if (!p) continue; sh->dev[i].page = NULL; put_page(p); }}static int grow_buffers(struct stripe_head *sh, int num){ int i; for (i=0; i<num; i++) { struct page *page; if (!(page = alloc_page(GFP_KERNEL))) { return 1; } sh->dev[i].page = page; } return 0;}static void raid5_build_block (struct stripe_head *sh, int i);static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks){ raid5_conf_t *conf = sh->raid_conf; int i; BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); CHECK_DEVLOCK(); pr_debug("init_stripe called, stripe %llu\n", (unsigned long long)sh->sector); remove_hash(sh); sh->sector = sector; sh->pd_idx = pd_idx; sh->state = 0; sh->disks = disks; for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", (unsigned long long)sh->sector, i, dev->toread, dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); BUG(); } dev->flags = 0; raid5_build_block(sh, i); } insert_hash(conf, sh);}static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks){ struct stripe_head *sh; struct hlist_node *hn; CHECK_DEVLOCK(); pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) if (sh->sector == sector && 
sh->disks == disks) return sh; pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); return NULL;}static void unplug_slaves(mddev_t *mddev);static void raid5_unplug_device(struct request_queue *q);static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, int pd_idx, int noblock){ struct stripe_head *sh; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); spin_lock_irq(&conf->device_lock); do { wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector, disks); if (!sh) { if (!conf->inactive_blocked) sh = get_free_stripe(conf); if (noblock && sh == NULL) break; if (!sh) { conf->inactive_blocked = 1; wait_event_lock_irq(conf->wait_for_stripe, !list_empty(&conf->inactive_list) && (atomic_read(&conf->active_stripes) < (conf->max_nr_stripes *3/4) || !conf->inactive_blocked), conf->device_lock, raid5_unplug_device(conf->mddev->queue) ); conf->inactive_blocked = 0; } else init_stripe(sh, sector, pd_idx, disks); } else { if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru)); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); if (list_empty(&sh->lru) && !test_bit(STRIPE_EXPANDING, &sh->state)) BUG(); list_del_init(&sh->lru); } } } while (sh == NULL); if (sh) atomic_inc(&sh->count); spin_unlock_irq(&conf->device_lock); return sh;}/* test_and_ack_op() ensures that we only dequeue an operation once */#define test_and_ack_op(op, pend) \do { \ if (test_bit(op, &sh->ops.pending) && \ !test_bit(op, &sh->ops.complete)) { \ if (test_and_set_bit(op, &sh->ops.ack)) \ clear_bit(op, &pend); \ else \ ack++; \ } else \ clear_bit(op, &pend); \} while (0)/* find new work to run, do not resubmit work that is already * in flight */static unsigned long get_stripe_work(struct stripe_head *sh){ unsigned long pending; int ack = 0; pending = sh->ops.pending; test_and_ack_op(STRIPE_OP_BIOFILL, pending); 
test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); test_and_ack_op(STRIPE_OP_PREXOR, pending); test_and_ack_op(STRIPE_OP_BIODRAIN, pending); test_and_ack_op(STRIPE_OP_POSTXOR, pending); test_and_ack_op(STRIPE_OP_CHECK, pending); if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) ack++; sh->ops.count -= ack; if (unlikely(sh->ops.count < 0)) { printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " "ops.complete: %#lx\n", pending, sh->ops.pending, sh->ops.ack, sh->ops.complete); BUG(); } return pending;}static voidraid5_end_read_request(struct bio *bi, int error);static voidraid5_end_write_request(struct bio *bi, int error);static void ops_run_io(struct stripe_head *sh){ raid5_conf_t *conf = sh->raid_conf; int i, disks = sh->disks; might_sleep(); for (i = disks; i--; ) { int rw; struct bio *bi; mdk_rdev_t *rdev; if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) rw = WRITE; else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; else continue; bi = &sh->dev[i].req; bi->bi_rw = rw; if (rw == WRITE) bi->bi_end_io = raid5_end_write_request; else bi->bi_end_io = raid5_end_read_request; rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); if (rdev) { if (test_bit(STRIPE_SYNCING, &sh->state) || test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || test_bit(STRIPE_EXPAND_READY, &sh->state)) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", __FUNCTION__, (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); bi->bi_sector = sh->sector + rdev->data_offset; bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_vcnt = 1; bi->bi_max_vecs = 1; bi->bi_idx = 0; bi->bi_io_vec = &sh->dev[i].vec; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; bi->bi_next = NULL; if (rw == WRITE && test_bit(R5_ReWrite, 
&sh->dev[i].flags)) atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); generic_make_request(bi); } else { if (rw == WRITE) set_bit(STRIPE_DEGRADED, &sh->state); pr_debug("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); } }}static struct dma_async_tx_descriptor *
/* (end of excerpt — file continues in part 2/5) */