📄 dm-raid1.c
/*
 * Copyright (C) 2003 Sistina Software Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-list.h"
#include "dm-io.h"
#include "dm-log.h"
#include "kcopyd.h"

#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

static struct workqueue_struct *_kmirrord_wq;
static struct work_struct _kmirrord_work;

static inline void wake(void)
{
        queue_work(_kmirrord_wq, &_kmirrord_work);
}

/*-----------------------------------------------------------------
 * Region hash
 *
 * The mirror splits itself up into discrete regions. Each
 * region can be in one of three states: clean, dirty,
 * nosync. There is no need to put clean regions in the hash.
 *
 * In addition to being present in the hash table a region _may_
 * be present on one of three lists.
 *
 *   clean_regions: Regions on this list have no io pending to
 *   them, they are in sync, we are no longer interested in them,
 *   they are dull. rh_update_states() will remove them from the
 *   hash table.
 *
 *   quiesced_regions: These regions have been spun down, ready
 *   for recovery. rh_recovery_start() will remove regions from
 *   this list and hand them to kmirrord, which will schedule the
 *   recovery io with kcopyd.
 *
 *   recovered_regions: Regions that kcopyd has successfully
 *   recovered. rh_update_states() will now schedule any delayed
 *   io, up the recovery_count, and remove the region from the
 *   hash.
 *
 * There are 2 locks:
 *   A rw spin lock 'hash_lock' protects just the hash table,
 *   this is never held in write mode from interrupt context,
 *   which I believe means that we only have to disable irqs when
 *   doing a write lock.
 *
 *   An ordinary spin lock 'region_lock' that protects the three
 *   lists in the region_hash, with the 'state', 'list' and
 *   'bhs_delayed' fields of the regions. This is used from irq
 *   context, so all other uses will have to suspend local irqs.
 *---------------------------------------------------------------*/
struct mirror_set;

struct region_hash {
        struct mirror_set *ms;
        sector_t region_size;
        unsigned region_shift;

        /* holds persistent region state */
        struct dirty_log *log;

        /* hash table */
        rwlock_t hash_lock;
        mempool_t *region_pool;
        unsigned int mask;
        unsigned int nr_buckets;
        struct list_head *buckets;

        spinlock_t region_lock;
        struct semaphore recovery_count;
        struct list_head clean_regions;
        struct list_head quiesced_regions;
        struct list_head recovered_regions;
};

enum {
        RH_CLEAN,
        RH_DIRTY,
        RH_NOSYNC,
        RH_RECOVERING
};

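/*
 * Illustrative sketch (not part of the original driver): a region moves
 * between the states above as writes arrive and recovery runs. The helper
 * below is a hypothetical debugging aid that spells the states out;
 * rh_state_name() does not exist in dm-raid1.c.
 */
#if 0   /* example only */
static const char *rh_state_name(int state)
{
        switch (state) {
        case RH_CLEAN:      return "clean";       /* in sync, no io pending       */
        case RH_DIRTY:      return "dirty";       /* in sync, writes in flight    */
        case RH_NOSYNC:     return "nosync";      /* out of sync, needs recovery  */
        case RH_RECOVERING: return "recovering";  /* being resynced by kcopyd     */
        default:            return "unknown";
        }
}
#endif
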
struct region {
        struct region_hash *rh; /* FIXME: can we get rid of this ? */
        region_t key;
        int state;

        struct list_head hash_list;
        struct list_head list;

        atomic_t pending;
        struct bio_list delayed_bios;
};

/*
 * Conversion fns
 */
static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
{
        return bio->bi_sector >> rh->region_shift;
}

static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
{
        return region << rh->region_shift;
}

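/*
 * Illustrative note (not part of the original driver): region_size is a
 * power of two, so rh_init() below derives region_shift as
 * ffs(region_size) - 1 and the conversions above reduce to plain shifts.
 * For example, with a hypothetical region_size of 1024 sectors:
 *
 *      region_shift = ffs(1024) - 1 = 10
 *      bio_to_region(sector 5000)   = 5000 >> 10 = region 4
 *      region_to_sector(region 4)   = 4 << 10    = sector 4096
 */
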
/* FIXME move this */
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);

static void *region_alloc(int gfp_mask, void *pool_data)
{
        return kmalloc(sizeof(struct region), gfp_mask);
}

static void region_free(void *element, void *pool_data)
{
        kfree(element);
}

#define MIN_REGIONS 64
#define MAX_RECOVERY 1
static int rh_init(struct region_hash *rh, struct mirror_set *ms,
                   struct dirty_log *log, sector_t region_size,
                   region_t nr_regions)
{
        unsigned int nr_buckets, max_buckets;
        size_t i;

        /*
         * Calculate a suitable number of buckets for our hash
         * table.
         */
        max_buckets = nr_regions >> 6;
        for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
                ;
        nr_buckets >>= 1;

        rh->ms = ms;
        rh->log = log;
        rh->region_size = region_size;
        rh->region_shift = ffs(region_size) - 1;
        rwlock_init(&rh->hash_lock);
        rh->mask = nr_buckets - 1;
        rh->nr_buckets = nr_buckets;

        rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
        if (!rh->buckets) {
                DMERR("unable to allocate region hash memory");
                return -ENOMEM;
        }

        for (i = 0; i < nr_buckets; i++)
                INIT_LIST_HEAD(rh->buckets + i);

        spin_lock_init(&rh->region_lock);
        sema_init(&rh->recovery_count, 0);
        INIT_LIST_HEAD(&rh->clean_regions);
        INIT_LIST_HEAD(&rh->quiesced_regions);
        INIT_LIST_HEAD(&rh->recovered_regions);

        rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
                                         region_free, NULL);
        if (!rh->region_pool) {
                vfree(rh->buckets);
                rh->buckets = NULL;
                return -ENOMEM;
        }

        return 0;
}

static void rh_exit(struct region_hash *rh)
{
        unsigned int h;
        struct region *reg, *nreg;

        BUG_ON(!list_empty(&rh->quiesced_regions));
        for (h = 0; h < rh->nr_buckets; h++) {
                list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
                        BUG_ON(atomic_read(&reg->pending));
                        mempool_free(reg, rh->region_pool);
                }
        }

        if (rh->log)
                dm_destroy_dirty_log(rh->log);
        if (rh->region_pool)
                mempool_destroy(rh->region_pool);
        vfree(rh->buckets);
}

#define RH_HASH_MULT 2654435387U

static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
{
        return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
}

static struct region *__rh_lookup(struct region_hash *rh, region_t region)
{
        struct region *reg;

        list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
                if (reg->key == region)
                        return reg;

        return NULL;
}

static void __rh_insert(struct region_hash *rh, struct region *reg)
{
        unsigned int h = rh_hash(rh, reg->key);
        list_add(&reg->hash_list, rh->buckets + h);
}

static struct region *__rh_alloc(struct region_hash *rh, region_t region)
{
        struct region *reg, *nreg;

        read_unlock(&rh->hash_lock);
        nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
        nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
                RH_CLEAN : RH_NOSYNC;
        nreg->rh = rh;
        nreg->key = region;

        INIT_LIST_HEAD(&nreg->list);
        atomic_set(&nreg->pending, 0);
        bio_list_init(&nreg->delayed_bios);
        write_lock_irq(&rh->hash_lock);

        reg = __rh_lookup(rh, region);
        if (reg)
                /* we lost the race */
                mempool_free(nreg, rh->region_pool);

        else {
                __rh_insert(rh, nreg);
                if (nreg->state == RH_CLEAN) {
                        spin_lock_irq(&rh->region_lock);
                        list_add(&nreg->list, &rh->clean_regions);
                        spin_unlock_irq(&rh->region_lock);
                }
                reg = nreg;
        }
        write_unlock_irq(&rh->hash_lock);
        read_lock(&rh->hash_lock);

        return reg;
}

static inline struct region *__rh_find(struct region_hash *rh, region_t region)
{
        struct region *reg;

        reg = __rh_lookup(rh, region);
        if (!reg)
                reg = __rh_alloc(rh, region);

        return reg;
}

static int rh_state(struct region_hash *rh, region_t region, int may_block)
{
        int r;
        struct region *reg;

        read_lock(&rh->hash_lock);
        reg = __rh_lookup(rh, region);
        read_unlock(&rh->hash_lock);

        if (reg)
                return reg->state;

        /*
         * The region wasn't in the hash, so we fall back to the
         * dirty log.
         */
        r = rh->log->type->in_sync(rh->log, region, may_block);

        /*
         * Any error from the dirty log (eg. -EWOULDBLOCK) gets
         * taken as a RH_NOSYNC
         */
        return r == 1 ? RH_CLEAN : RH_NOSYNC;
}

static inline int rh_in_sync(struct region_hash *rh,
                             region_t region, int may_block)
{
        int state = rh_state(rh, region, may_block);
        return state == RH_CLEAN || state == RH_DIRTY;
}

static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
{
        struct bio *bio;

        while ((bio = bio_list_pop(bio_list))) {
                queue_bio(ms, bio, WRITE);
        }
}

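/*
 * Illustrative sketch (not part of the original driver): __rh_alloc() above
 * follows the classic "drop the lock, allocate, retake the lock, re-check"
 * pattern, because mempool_alloc(GFP_NOIO) may sleep and so cannot be called
 * with hash_lock held. Stripped of the dm-raid1 specifics it looks like the
 * outline below; lookup(), insert(), new_node() and free_node() are
 * placeholder names, not real functions.
 */
#if 0   /* example only */
        read_unlock(&lock);             /* cannot sleep under the lock        */
        new = new_node(key);            /* may sleep                          */
        write_lock_irq(&lock);
        node = lookup(key);             /* somebody may have beaten us to it  */
        if (node)
                free_node(new);         /* we lost the race                   */
        else
                node = insert(new);
        write_unlock_irq(&lock);
        read_lock(&lock);               /* caller still expects the read lock */
#endif
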
static void rh_update_states(struct region_hash *rh)
{
        struct region *reg, *next;

        LIST_HEAD(clean);
        LIST_HEAD(recovered);

        /*
         * Quickly grab the lists.
         */
        write_lock_irq(&rh->hash_lock);
        spin_lock(&rh->region_lock);
        if (!list_empty(&rh->clean_regions)) {
                list_splice(&rh->clean_regions, &clean);
                INIT_LIST_HEAD(&rh->clean_regions);

                list_for_each_entry (reg, &clean, list) {
                        rh->log->type->clear_region(rh->log, reg->key);
                        list_del(&reg->hash_list);
                }
        }

        if (!list_empty(&rh->recovered_regions)) {
                list_splice(&rh->recovered_regions, &recovered);
                INIT_LIST_HEAD(&rh->recovered_regions);

                list_for_each_entry (reg, &recovered, list)
                        list_del(&reg->hash_list);
        }
        spin_unlock(&rh->region_lock);
        write_unlock_irq(&rh->hash_lock);

        /*
         * All the regions on the recovered and clean lists have
         * now been pulled out of the system, so no need to do
         * any more locking.
         */
        list_for_each_entry_safe (reg, next, &recovered, list) {
                rh->log->type->clear_region(rh->log, reg->key);
                rh->log->type->complete_resync_work(rh->log, reg->key, 1);
                dispatch_bios(rh->ms, &reg->delayed_bios);
                up(&rh->recovery_count);
                mempool_free(reg, rh->region_pool);
        }

        if (!list_empty(&recovered))
                rh->log->type->flush(rh->log);

        list_for_each_entry_safe (reg, next, &clean, list)
                mempool_free(reg, rh->region_pool);
}

static void rh_inc(struct region_hash *rh, region_t region)
{
        struct region *reg;

        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, region);
        if (reg->state == RH_CLEAN) {
                rh->log->type->mark_region(rh->log, reg->key);

                spin_lock_irq(&rh->region_lock);
                reg->state = RH_DIRTY;
                list_del_init(&reg->list);      /* take off the clean list */
                spin_unlock_irq(&rh->region_lock);
        }

        atomic_inc(&reg->pending);
        read_unlock(&rh->hash_lock);
}

static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
{
        struct bio *bio;

        for (bio = bios->head; bio; bio = bio->bi_next)
                rh_inc(rh, bio_to_region(rh, bio));
}

static void rh_dec(struct region_hash *rh, region_t region)
{
        unsigned long flags;
        struct region *reg;
        int should_wake = 0;

        read_lock(&rh->hash_lock);
        reg = __rh_lookup(rh, region);
        read_unlock(&rh->hash_lock);

        if (atomic_dec_and_test(&reg->pending)) {
                spin_lock_irqsave(&rh->region_lock, flags);
                if (reg->state == RH_RECOVERING) {
                        list_add_tail(&reg->list, &rh->quiesced_regions);
                } else {
                        reg->state = RH_CLEAN;
                        list_add(&reg->list, &rh->clean_regions);
                }
                spin_unlock_irqrestore(&rh->region_lock, flags);
                should_wake = 1;
        }

        if (should_wake)
                wake();
}

/*
 * Starts quiescing a region in preparation for recovery.
 */
static int __rh_recovery_prepare(struct region_hash *rh)
{
        int r;
        struct region *reg;
        region_t region;

        /*
         * Ask the dirty log what's next.
         */
        r = rh->log->type->get_resync_work(rh->log, &region);
        if (r <= 0)
                return r;

        /*
         * Get this region, and start it quiescing by setting the
         * recovering flag.
         */
        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, region);
        read_unlock(&rh->hash_lock);

        spin_lock_irq(&rh->region_lock);
        reg->state = RH_RECOVERING;

        /* Already quiesced ? */
        if (atomic_read(&reg->pending))
                list_del_init(&reg->list);

        else {
                list_del_init(&reg->list);
                list_add(&reg->list, &rh->quiesced_regions);
        }
        spin_unlock_irq(&rh->region_lock);

        return 1;
}

static void rh_recovery_prepare(struct region_hash *rh)
{
        while (!down_trylock(&rh->recovery_count))
                if (__rh_recovery_prepare(rh) <= 0) {
                        up(&rh->recovery_count);
                        break;
                }
}

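/*
 * Illustrative sketch (not part of the original driver): rh_update_states()
 * above uses the common "splice under the lock, work outside it" idiom. The
 * shared lists are emptied onto private list heads while the locks are held,
 * and the potentially slow per-region work (log updates, dispatching delayed
 * bios, freeing) then runs with no locks held. In outline, with shared_list,
 * local, entry and process_entry() as placeholder names:
 */
#if 0   /* example only */
        LIST_HEAD(local);

        spin_lock(&lock);
        list_splice(&shared_list, &local);      /* steal the whole list...  */
        INIT_LIST_HEAD(&shared_list);           /* ...and leave it empty    */
        spin_unlock(&lock);

        list_for_each_entry_safe (entry, next, &local, list)
                process_entry(entry);           /* slow work, lock not held */
#endif
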
/*
 * Returns any quiesced regions.
 */
static struct region *rh_recovery_start(struct region_hash *rh)
{
        struct region *reg = NULL;

        spin_lock_irq(&rh->region_lock);
        if (!list_empty(&rh->quiesced_regions)) {
                reg = list_entry(rh->quiesced_regions.next,
                                 struct region, list);
                list_del_init(&reg->list);      /* remove from the quiesced list */
        }
        spin_unlock_irq(&rh->region_lock);

        return reg;
}

/* FIXME: success ignored for now */
static void rh_recovery_end(struct region *reg, int success)
{
        struct region_hash *rh = reg->rh;

        spin_lock_irq(&rh->region_lock);
        list_add(&reg->list, &reg->rh->recovered_regions);
        spin_unlock_irq(&rh->region_lock);

        wake();
}

static void rh_flush(struct region_hash *rh)
{
        rh->log->type->flush(rh->log);
}

static void rh_delay(struct region_hash *rh, struct bio *bio)
{
        struct region *reg;

        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, bio_to_region(rh, bio));
        bio_list_add(&reg->delayed_bios, bio);
        read_unlock(&rh->hash_lock);
}

static void rh_stop_recovery(struct region_hash *rh)
{
        int i;

        /* wait for any recovering regions */
        for (i = 0; i < MAX_RECOVERY; i++)
                down(&rh->recovery_count);
}

static void rh_start_recovery(struct region_hash *rh)
{
        int i;

        for (i = 0; i < MAX_RECOVERY; i++)
                up(&rh->recovery_count);

        wake();
}

/*-----------------------------------------------------------------
 * Mirror set structures.
 *---------------------------------------------------------------*/
struct mirror {
        atomic_t error_count;
        struct dm_dev *dev;
        sector_t offset;
};

struct mirror_set {
        struct dm_target *ti;
        struct list_head list;
        struct region_hash rh;
        struct kcopyd_client *kcopyd_client;

        spinlock_t lock;        /* protects the next two lists */
        struct bio_list reads;
        struct bio_list writes;

        /* recovery */
        region_t nr_regions;
        int in_sync;

        unsigned int nr_mirrors;
        struct mirror mirror[0];
};

/*
 * Every mirror should look like this one.
 */
#define DEFAULT_MIRROR 0

/*
 * This is yucky. We squirrel the mirror_set struct away inside
 * bi_next for write buffers. This is safe since the bh
 * doesn't get submitted to the lower levels of block layer.
 */
static struct mirror_set *bio_get_ms(struct bio *bio)
{
        return (struct mirror_set *) bio->bi_next;
}

static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
{
        bio->bi_next = (struct bio *) ms;
}

/*-----------------------------------------------------------------
 * Recovery.
 *
 * When a mirror is first activated we may find that some regions
 * are in the no-sync state. We have to recover these by
 * recopying from the default mirror to all the others.
 *---------------------------------------------------------------*/
static void recovery_complete(int read_err, unsigned int write_err,
                              void *context)
{
        struct region *reg = (struct region *) context;

        /* FIXME: better error handling */
        rh_recovery_end(reg, read_err || write_err);
}

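/*
 * Illustrative sketch (not part of the original driver): recovery_count is a
 * counting semaphore used as a throttle. rh_start_recovery() makes
 * MAX_RECOVERY "slots" available, rh_recovery_prepare() claims one slot per
 * region with down_trylock(), rh_update_states() returns a slot with up()
 * once kcopyd finishes a region, and rh_stop_recovery() drains every slot to
 * wait for in-flight recoveries. In outline, with throttle and
 * do_one_recovery() as placeholder names:
 */
#if 0   /* example only */
        sema_init(&throttle, 0);                /* no recovery allowed yet       */
        for (i = 0; i < MAX_RECOVERY; i++)
                up(&throttle);                  /* open MAX_RECOVERY slots       */

        while (!down_trylock(&throttle))        /* claim a slot, never block     */
                do_one_recovery();              /* slot returned via up() later  */

        for (i = 0; i < MAX_RECOVERY; i++)
                down(&throttle);                /* wait until all slots returned */
#endif
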
static int recover(struct mirror_set *ms, struct region *reg)
{
        int r;
        unsigned int i;
        struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
        struct mirror *m;
        unsigned long flags = 0;

        /* fill in the source */
        m = ms->mirror + DEFAULT_MIRROR;
        from.bdev = m->dev->bdev;
        from.sector = m->offset + region_to_sector(reg->rh, reg->key);
        if (reg->key == (ms->nr_regions - 1)) {
                /*
                 * The final region may be smaller than
                 * region_size.
                 */
                from.count = ms->ti->len & (reg->rh->region_size - 1);
                if (!from.count)
                        from.count = reg->rh->region_size;
        } else
                from.count = reg->rh->region_size;

        /* fill in the destinations */
        for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
                if (i == DEFAULT_MIRROR)
                        continue;

                m = ms->mirror + i;
                dest->bdev = m->dev->bdev;
                dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
                dest->count = from.count;
                dest++;
        }

        /* hand to kcopyd */
        set_bit(KCOPYD_IGNORE_ERROR, &flags);
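
/*
 * Illustrative note (not part of the original driver): recover() sizes the
 * last region with ti->len & (region_size - 1), which is ti->len modulo the
 * power-of-two region_size. For a hypothetical target of 10000 sectors and a
 * region_size of 1024:
 *
 *      10000 & (1024 - 1) = 10000 % 1024 = 784 sectors in the final region
 *
 * and when the target length is an exact multiple the mask yields 0, so the
 * code falls back to a full region_size.
 */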