raid1.c
From "Linux Kernel 2.6.9 for OMAP1710" · C source code · 1,426 lines total · page 1 of 3
C
1,426 行
/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/raid/raid1.h>

/*
 * Number of guaranteed r1bios in case of extreme VM load:
 */
#define NR_RAID1_BIOS 256

static mdk_personality_t raid1_personality;

/* Global list of r1bios awaiting retry by the raid1d thread, and its lock. */
static spinlock_t retry_list_lock = SPIN_LOCK_UNLOCKED;
static LIST_HEAD(retry_list_head);

static void unplug_slaves(mddev_t *mddev);

/*
 * Mempool allocation callback for r1bio descriptors.  On allocation
 * failure it unplugs the member devices so in-flight IO (which will
 * eventually free r1bios back to the pool) can make progress.
 */
static void * r1bio_pool_alloc(int gfp_flags, void *data)
{
	struct pool_info *pi = data;
	r1bio_t *r1_bio;
	/* trailing bios[] is sized by the number of member disks */
	int size = offsetof(r1bio_t, bios[pi->raid_disks]);

	/* allocate a r1bio with room for raid_disks entries in the bios array */
	r1_bio = kmalloc(size, gfp_flags);
	if (r1_bio)
		memset(r1_bio, 0, size);
	else
		unplug_slaves(pi->mddev);

	return r1_bio;
}

/* Mempool free callback paired with r1bio_pool_alloc(). */
static void r1bio_pool_free(void *r1_bio, void *data)
{
	kfree(r1_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)

/*
 * Mempool allocation callback for resync buffers: an r1bio plus one bio
 * per member disk, with RESYNC_PAGES data pages attached to bios[0]
 * (the read side) only.  Cleanup on failure uses the classic goto
 * unwind; note the loop counters i/j are reused by the error paths.
 */
static void * r1buf_pool_alloc(int gfp_flags, void *data)
{
	struct pool_info *pi = data;
	struct page *page;
	r1bio_t *r1_bio;
	struct bio *bio;
	int i, j;

	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
	if (!r1_bio) {
		unplug_slaves(pi->mddev);
		return NULL;
	}

	/*
	 * Allocate bios : 1 for reading, n-1 for writing
	 */
	for (j = pi->raid_disks ; j-- ; ) {
		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r1_bio->bios[j] = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them to
	 * the first bio;
	 */
	bio = r1_bio->bios[0];
	for (i = 0; i < RESYNC_PAGES; i++) {
		page = alloc_page(gfp_flags);
		if (unlikely(!page))
			goto out_free_pages;

		bio->bi_io_vec[i].bv_page = page;
	}

	r1_bio->master_bio = NULL;

	return r1_bio;

out_free_pages:
	/* free the pages allocated so far (i is the first unfilled slot) */
	for ( ; i > 0 ; i--)
		__free_page(bio->bi_io_vec[i-1].bv_page);
out_free_bio:
	/* j is the index that failed (or -1 via out_free_pages after all
	 * bios were allocated); free the bios above it */
	while ( ++j < pi->raid_disks )
		bio_put(r1_bio->bios[j]);
	r1bio_pool_free(r1_bio, data);
	return NULL;
}

/*
 * Mempool free callback paired with r1buf_pool_alloc(): releases the
 * data pages hanging off bios[0], then every per-disk bio, then the
 * r1bio itself.
 */
static void r1buf_pool_free(void *__r1_bio, void *data)
{
	struct pool_info *pi = data;
	int i;
	r1bio_t *r1bio = __r1_bio;
	struct bio *bio = r1bio->bios[0];

	for (i = 0; i < RESYNC_PAGES; i++) {
		__free_page(bio->bi_io_vec[i].bv_page);
		bio->bi_io_vec[i].bv_page = NULL;
	}
	for (i=0 ; i < pi->raid_disks; i++)
		bio_put(r1bio->bios[i]);

	r1bio_pool_free(r1bio, data);
}

/* Drop the reference on every per-mirror bio of an r1bio and clear the slots. */
static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
{
	int i;

	for (i = 0; i < conf->raid_disks; i++) {
		struct bio **bio = r1_bio->bios + i;
		if (*bio)
			bio_put(*bio);
		*bio = NULL;
	}
}

/*
 * Release a normal-IO r1bio back to its pool, decrementing nr_pending
 * under resync_lock and waking any resync waiters when the array goes
 * idle.
 */
static inline void free_r1bio(r1bio_t *r1_bio)
{
	unsigned long flags;

	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);

	put_all_bios(conf, r1_bio);
	mempool_free(r1_bio, conf->r1bio_pool);
}

/*
 * Release a resync-buffer r1bio back to the r1buf pool and drop one
 * level of the resync barrier (BUG if the barrier count is already
 * zero), waking waiters on both queues.
 */
static inline void put_buf(r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);
	unsigned long flags;

	mempool_free(r1_bio, conf->r1buf_pool);

	spin_lock_irqsave(&conf->resync_lock, flags);
	if (!conf->barrier)
		BUG();
	--conf->barrier;
	wake_up(&conf->wait_resume);
	wake_up(&conf->wait_idle);

	if (!--conf->nr_pending) {
		wake_up(&conf->wait_idle);
		wake_up(&conf->wait_resume);
	}
	spin_unlock_irqrestore(&conf->resync_lock, flags);
}

/*
 * Queue a failed r1bio on the global retry list and kick the md thread
 * so raid1d can retry it from process context.
 */
static void reschedule_retry(r1bio_t *r1_bio)
{
	unsigned long flags;
	mddev_t *mddev = r1_bio->mddev;

	spin_lock_irqsave(&retry_list_lock, flags);
	list_add(&r1_bio->retry_list, &retry_list_head);
	spin_unlock_irqrestore(&retry_list_lock, flags);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(r1bio_t *r1_bio)
{
	struct bio *bio = r1_bio->master_bio;

	bio_endio(bio, bio->bi_size,
		test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
	free_r1bio(r1_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int disk, r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	conf->mirrors[disk].head_position =
		r1_bio->sector + (r1_bio->sectors);
}

/*
 * bi_end_io for normal reads.  On success completes the master bio; on
 * failure marks the mirror faulty via md_error() and reschedules the
 * read for retry on another mirror.
 */
static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror;
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	/* partial completion: more of this bio still in flight */
	if (bio->bi_size)
		return 1;

	mirror = r1_bio->read_disk;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
	else
		/*
		 * Set R1BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R1BIO_Uptodate, &r1_bio->state);

	update_head_pos(mirror, r1_bio);

	/*
	 * we have only one bio on the read side
	 */
	if (uptodate)
		raid_end_bio_io(r1_bio);
	else {
		/*
		 * oops, read error:
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
			       bdevname(conf->mirrors[mirror].rdev->bdev,b),
			       (unsigned long long)r1_bio->sector);
		reschedule_retry(r1_bio);
	}

	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
	return 0;
}

/*
 * bi_end_io for writes.  Identifies which mirror this bio belongs to by
 * scanning r1_bio->bios[], records the error (if any) via md_error(),
 * and completes the master bio once the last mirrored write finishes.
 */
static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror;
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	/* partial completion: more of this bio still in flight */
	if (bio->bi_size)
		return 1;

	for (mirror = 0; mirror < conf->raid_disks; mirror++)
		if (r1_bio->bios[mirror] == bio)
			break;

	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
	else
		/*
		 * Set R1BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R1BIO_Uptodate, &r1_bio->state);

	update_head_pos(mirror, r1_bio);

	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	if (atomic_dec_and_test(&r1_bio->remaining)) {
		md_write_end(r1_bio->mddev);
		raid_end_bio_io(r1_bio);
	}

	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
	return 0;
}

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */
static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
	const unsigned long this_sector = r1_bio->sector;
	int new_disk = conf->last_used, disk = new_disk;
	const int sectors = r1_bio->sectors;
	sector_t new_distance, current_distance;

	spin_lock_irq(&conf->device_lock);
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on, or below the resync window.
	 * We take the first readable disk when above the resync window.
	 */
	if (conf->mddev->recovery_cp < MaxSector &&
	    (this_sector + sectors >= conf->next_resync)) {
		/* Choose the first operational device, for consistency */
		new_disk = 0;

		while (!conf->mirrors[new_disk].rdev ||
		       !conf->mirrors[new_disk].rdev->in_sync) {
			new_disk++;
			if (new_disk == conf->raid_disks) {
				/* no in-sync disk at all */
				new_disk = -1;
				break;
			}
		}
		goto rb_out;
	}

	/* make sure the disk is operational */
	while (!conf->mirrors[new_disk].rdev ||
	       !conf->mirrors[new_disk].rdev->in_sync) {
		if (new_disk <= 0)
			new_disk = conf->raid_disks;
		new_disk--;
		if (new_disk == disk) {
			/* wrapped all the way around: nothing usable */
			new_disk = -1;
			goto rb_out;
		}
	}
	disk = new_disk;
	/* now disk == new_disk == starting point for search */

	/*
	 * Don't change to another disk for sequential reads:
	 */
	if (conf->next_seq_sect == this_sector)
		goto rb_out;
	if (this_sector == conf->mirrors[new_disk].head_position)
		goto rb_out;

	current_distance = abs(this_sector - conf->mirrors[disk].head_position);

	/* Find the disk whose head is closest */

	do {
		if (disk <= 0)
			disk = conf->raid_disks;
		disk--;

		if (!conf->mirrors[disk].rdev ||
		    !conf->mirrors[disk].rdev->in_sync)
			continue;

		/* an idle disk wins immediately */
		if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) {
			new_disk = disk;
			break;
		}
		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
		if (new_distance < current_distance) {
			current_distance = new_distance;
			new_disk = disk;
		}
	} while (disk != conf->last_used);

rb_out:
	if (new_disk >= 0) {
		conf->next_seq_sect = this_sector + sectors;
		conf->last_used = new_disk;
		atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending);
	}
	spin_unlock_irq(&conf->device_lock);

	return new_disk;
}

/*
 * Unplug the request queue of every member device that has IO pending.
 * device_lock is dropped around the unplug_fn call (the rdev is pinned
 * via nr_pending while unlocked) since unplug may sleep or recurse.
 */
static void unplug_slaves(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			spin_unlock_irqrestore(&conf->device_lock, flags);

			if (r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);

			spin_lock_irqsave(&conf->device_lock, flags);
			atomic_dec(&rdev->nr_pending);
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

/* Queue unplug callback for the raid1 array: unplug all member devices. */
static void raid1_unplug(request_queue_t *q)
{
	unplug_slaves(q->queuedata);
}

/*
 * Issue a cache flush on every non-faulty member device, stopping at
 * the first failure.
 * NOTE(review): this chunk of the file is truncated here (page 1 of 3);
 * the remainder of raid1_issue_flush continues on the next page.
 */
static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;
	int i, ret = 0;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue = bdev_get_queue(bdev);

			if (r_queue->issue_flush_fn) {
				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
				if (ret)
					break;
			}
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
⌨️ Keyboard shortcuts
Copy code: Ctrl + C
Search code: Ctrl + F
Full-screen mode: F11
Increase font size: Ctrl + =
Decrease font size: Ctrl + -
Show shortcuts: ?