/*
 * raid1.c
 * From "Linux kernel source code" · C code · 2,200 lines in total · page 1 of 4
 * C
 * 2,200 lines
 */
/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
 * bitmapped intelligence in resync:
 *
 *      - bitmap marked during normal i/o
 *      - bitmap used to skip nondirty blocks during sync
 *
 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
 * - persistent bitmap code
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "dm-bio-list.h"
#include <linux/raid/raid1.h>
#include <linux/raid/bitmap.h>

#define DEBUG 0
#if DEBUG
#define PRINTK(x...) printk(x)
#else
#define PRINTK(x...)
#endif

/*
 * Number of guaranteed r1bios in case of extreme VM load:
 */
#define	NR_RAID1_BIOS 256

static void unplug_slaves(mddev_t *mddev);

static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);

/*
 * Allocate one zeroed r1bio large enough to hold pi->raid_disks entries
 * in its trailing bios[] array.
 *
 * @gfp_flags: allocation flags handed straight to kzalloc()
 * @data:      the struct pool_info describing the array
 *
 * Returns the new r1bio, or NULL on allocation failure.  On failure the
 * member devices are unplugged before returning.
 */
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct pool_info *pi = data;
	r1bio_t *r1_bio;
	int size = offsetof(r1bio_t, bios[pi->raid_disks]);

	/* allocate a r1bio with room for raid_disks entries in the bios array */
	r1_bio = kzalloc(size, gfp_flags);
	if (!r1_bio)
		unplug_slaves(pi->mddev);

	return r1_bio;
}

/*
 * Release an r1bio obtained from r1bio_pool_alloc().  @data is unused.
 */
static void r1bio_pool_free(void *r1_bio, void *data)
{
	kfree(r1_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)

/*
 * Allocate an r1bio together with one bio per mirror and the data pages
 * backing them, for use as a resync buffer.
 *
 * @gfp_flags: allocation flags used for the r1bio, the bios and the pages
 * @data:      the struct pool_info describing the array
 *
 * When MD_RECOVERY_REQUESTED is set every bio gets its own RESYNC_PAGES
 * pages; otherwise only bios[0] gets pages and the remaining bios share
 * those page pointers.
 *
 * Returns the r1bio (with master_bio cleared), or NULL if any allocation
 * failed, in which case everything allocated so far is released.
 */
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct pool_info *pi = data;
	struct page *page;
	r1bio_t *r1_bio;
	struct bio *bio;
	int i, j;

	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
	if (!r1_bio) {
		unplug_slaves(pi->mddev);
		return NULL;
	}

	/*
	 * Allocate bios : 1 for reading, n-1 for writing
	 */
	for (j = pi->raid_disks ; j-- ; ) {
		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r1_bio->bios[j] = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them to
	 * the first bio.
	 * If this is a user-requested check/repair, allocate
	 * RESYNC_PAGES for each bio.
	 */
	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
		j = pi->raid_disks;
	else
		j = 1;
	while(j--) {
		bio = r1_bio->bios[j];
		for (i = 0; i < RESYNC_PAGES; i++) {
			page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
		}
	}
	/* If not user-requested, copy the page pointers to all bios */
	if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
		for (i=0; i<RESYNC_PAGES ; i++)
			for (j=1; j<pi->raid_disks; j++)
				r1_bio->bios[j]->bi_io_vec[i].bv_page =
					r1_bio->bios[0]->bi_io_vec[i].bv_page;
	}

	r1_bio->master_bio = NULL;

	return r1_bio;

out_free_pages:
	for (i=0; i < RESYNC_PAGES ; i++)
		for (j=0 ; j < pi->raid_disks; j++)
			safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
	/* all bios were allocated on this path: make the loop below start at 0 */
	j = -1;
out_free_bio:
	/*
	 * The bios were allocated from the highest index downwards, so when
	 * bio_alloc() failed at index j only the entries above j exist.
	 */
	while ( ++j < pi->raid_disks )
		bio_put(r1_bio->bios[j]);
	r1bio_pool_free(r1_bio, data);
	return NULL;
}

/*
 * Release a resync buffer built by r1buf_pool_alloc(): its pages, its
 * bios and finally the r1bio itself.
 *
 * A page is put for bios[0] unconditionally, and for any other bio only
 * when its page pointer differs from the one in bios[0], so that pages
 * shared between bios are put once.
 */
static void r1buf_pool_free(void *__r1_bio, void *data)
{
	struct pool_info *pi = data;
	int i,j;
	r1bio_t *r1bio = __r1_bio;

	for (i = 0; i < RESYNC_PAGES; i++)
		for (j = pi->raid_disks; j-- ;) {
			if (j == 0 ||
			    r1bio->bios[j]->bi_io_vec[i].bv_page !=
			    r1bio->bios[0]->bi_io_vec[i].bv_page)
				safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
		}
	for (i=0 ; i < pi->raid_disks; i++)
		bio_put(r1bio->bios[i]);

	r1bio_pool_free(r1bio, data);
}

/*
 * Drop the reference on every per-mirror bio attached to @r1_bio and
 * clear all slots.  Slots that are NULL or hold the IO_BLOCKED marker
 * are cleared without a bio_put().
 */
static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
{
	int i;

	for (i = 0; i < conf->raid_disks; i++) {
		struct bio **bio = r1_bio->bios + i;
		if (*bio && *bio != IO_BLOCKED)
			bio_put(*bio);
		*bio = NULL;
	}
}

/*
 * Finish with a normal-I/O r1bio: allow the barrier, drop its per-mirror
 * bios and return it to conf->r1bio_pool.
 */
static void free_r1bio(r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	allow_barrier(conf);

	put_all_bios(conf, r1_bio);
	mempool_free(r1_bio, conf->r1bio_pool);
}

/*
 * Finish with a resync r1bio: drop the pending count of every mirror
 * whose bio has an end_io handler set, return the buffer to
 * conf->r1buf_pool and lower the barrier.
 */
static void put_buf(r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);
	int i;

	for (i=0; i<conf->raid_disks; i++) {
		struct bio *bio = r1_bio->bios[i];
		if (bio->bi_end_io)
			rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
	}

	mempool_free(r1_bio, conf->r1buf_pool);

	lower_barrier(conf);
}

/*
 * Queue @r1_bio on conf->retry_list (under conf->device_lock, counting it
 * in conf->nr_queued), then wake waiters on conf->wait_barrier and the
 * array's md thread.
 */
static void reschedule_retry(r1bio_t *r1_bio)
{
	unsigned long flags;
	mddev_t *mddev = r1_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r1_bio->retry_list, &conf->retry_list);
	conf->nr_queued ++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	wake_up(&conf->wait_barrier);
	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 *
 * The master bio is ended at most once, guarded by R1BIO_Returned, with
 * status 0 when R1BIO_Uptodate is set and -EIO otherwise.  The r1bio is
 * freed in either case.
 */
static void raid_end_bio_io(r1bio_t *r1_bio)
{
	struct bio *bio = r1_bio->master_bio;

	/* if nobody has done the final endio yet, do it now */
	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
		PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
			(bio_data_dir(bio) == WRITE) ? "write" : "read",
			(unsigned long long) bio->bi_sector,
			(unsigned long long) bio->bi_sector +
				(bio->bi_size >> 9) - 1);

		bio_endio(bio,
			test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
	}
	free_r1bio(r1_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 *
 * The estimate for mirror @disk becomes the sector just past the end of
 * the request described by @r1_bio.
 */
static inline void update_head_pos(int disk, r1bio_t *r1_bio)
{
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	conf->mirrors[disk].head_position =
		r1_bio->sector + (r1_bio->sectors);
}

/*
 * Completion handler for a read issued to one mirror.
 *
 * On success the r1bio is marked up to date and the master bio is ended.
 * On failure the r1bio is normally queued for retry; the error is passed
 * up instead when the degraded count shows no other device is left to
 * retry on.  The pending count taken on the mirror is dropped last.
 */
static void raid1_end_read_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror;
	conf_t *conf = mddev_to_conf(r1_bio->mddev);

	mirror = r1_bio->read_disk;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(mirror, r1_bio);

	if (uptodate)
		set_bit(R1BIO_Uptodate, &r1_bio->state);
	else {
		/* If all other devices have failed, we want to return
		 * the error upwards rather than fail the last device.
		 * Here we redefine "uptodate" to mean "Don't want to retry"
		 */
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		if (r1_bio->mddev->degraded == conf->raid_disks ||
		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
		     !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
			uptodate = 1;
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}

	if (uptodate)
		raid_end_bio_io(r1_bio);
	else {
		/*
		 * oops, read error:
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
			       bdevname(conf->mirrors[mirror].rdev->bdev,b),
			       (unsigned long long)r1_bio->sector);
		reschedule_retry(r1_bio);
	}

	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}

/*
 * Completion handler for a write issued to one mirror.
 *
 * The mirror is identified by locating @bio in r1_bio->bios[].  A barrier
 * write rejected with -EOPNOTSUPP flags the r1bio for a barrier retry and
 * keeps the pending count for that retry.  Any other completion records
 * success or failure, updates the head position and, for write-behind
 * requests, may end the master bio early.  When the last outstanding
 * mirror write completes, the request is either rescheduled (barrier
 * retry) or finished: the bitmap is updated and the master bio is ended.
 */
static void raid1_end_write_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
	conf_t *conf = mddev_to_conf(r1_bio->mddev);
	struct bio *to_put = NULL;


	for (mirror = 0; mirror < conf->raid_disks; mirror++)
		if (r1_bio->bios[mirror] == bio)
			break;

	if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
		set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
		set_bit(R1BIO_BarrierRetry, &r1_bio->state);
		r1_bio->mddev->barriers_work = 0;
		/* Don't rdev_dec_pending in this branch - keep it for the retry */
	} else {
		/*
		 * this branch is our 'one mirror IO has finished' event handler:
		 */
		r1_bio->bios[mirror] = NULL;
		/* the bio is still read below, so it is only put on exit */
		to_put = bio;
		if (!uptodate) {
			md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
			/* an I/O failed, we can't clear the bitmap */
			set_bit(R1BIO_Degraded, &r1_bio->state);
		} else
			/*
			 * Set R1BIO_Uptodate in our master bio, so that
			 * we will return a good error code to the higher
			 * levels even if IO on some other mirrored buffer fails.
			 *
			 * The 'master' represents the composite IO operation to
			 * user-side. So if something waits for IO, then it will
			 * wait for the 'master' bio.
			 */
			set_bit(R1BIO_Uptodate, &r1_bio->state);

		update_head_pos(mirror, r1_bio);

		if (behind) {
			if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
				atomic_dec(&r1_bio->behind_remaining);

			/* In behind mode, we ACK the master bio once the I/O has safely
			 * reached all non-writemostly disks. Setting the Returned bit
			 * ensures that this gets done only once -- we don't ever want to
			 * return -EIO here, instead we'll wait */

			if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
			    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
				/* Maybe we can return now */
				if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
					struct bio *mbio = r1_bio->master_bio;
					PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
					       (unsigned long long) mbio->bi_sector,
					       (unsigned long long) mbio->bi_sector +
					       (mbio->bi_size >> 9) - 1);
					bio_endio(mbio, 0);
				}
			}
		}
		rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
	}
	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	if (atomic_dec_and_test(&r1_bio->remaining)) {
		if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
			reschedule_retry(r1_bio);
		else {
			/* it really is the end of this request */
			if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
				/* free extra copy of the data pages */
				int i = bio->bi_vcnt;
				while (i--)
					safe_put_page(bio->bi_io_vec[i].bv_page);
			}
			/* clear the bitmap if all writes complete successfully */
			bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
					r1_bio->sectors,
					!test_bit(R1BIO_Degraded, &r1_bio->state),
					behind);
			md_write_end(r1_bio->mddev);
			raid_end_bio_io(r1_bio);
		}
	}

	if (to_put)
		bio_put(to_put);
}


/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 *
 * Returns the index of the chosen mirror, or a negative value when no
 * usable mirror was found.  A write-mostly mirror is only chosen as a
 * fallback, when no other in-sync, non-blocked mirror exists.
 */
static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
	/*
	 * Held as sector_t, like the distances computed from it below, so
	 * the sector number is not narrowed on builds where sector_t is
	 * wider than unsigned long.
	 */
	const sector_t this_sector = r1_bio->sector;
	int new_disk = conf->last_used, disk = new_disk;
	int wonly_disk = -1;
	const int sectors = r1_bio->sectors;
	sector_t new_distance, current_distance;
	mdk_rdev_t *rdev;

	rcu_read_lock();
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on, or below the resync window.
	 * We take the first readable disk when above the resync window.
	 */
 retry:
	if (conf->mddev->recovery_cp < MaxSector &&
	    (this_sector + sectors >= conf->next_resync)) {
		/* Choose the first operation device, for consistency */
		new_disk = 0;

		for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
		     r1_bio->bios[new_disk] == IO_BLOCKED ||
		     !rdev || !test_bit(In_sync, &rdev->flags)
			     || test_bit(WriteMostly, &rdev->flags);
		     rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {

			/* usable but rejected above: remember it as a fallback */
			if (rdev && test_bit(In_sync, &rdev->flags) &&
				r1_bio->bios[new_disk] != IO_BLOCKED)
				wonly_disk = new_disk;

			if (new_disk == conf->raid_disks - 1) {
				new_disk = wonly_disk;
				break;
			}
		}
		goto rb_out;
	}


	/* make sure the disk is operational */
	for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
	     r1_bio->bios[new_disk] == IO_BLOCKED ||
	     !rdev || !test_bit(In_sync, &rdev->flags) ||
		     test_bit(WriteMostly, &rdev->flags);
	     rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {

		/* usable but rejected above: remember it as a fallback */
		if (rdev && test_bit(In_sync, &rdev->flags) &&
			r1_bio->bios[new_disk] != IO_BLOCKED)
			wonly_disk = new_disk;

		/* step downwards through the mirrors, wrapping at 0 */
		if (new_disk <= 0)
			new_disk = conf->raid_disks;
		new_disk--;
		if (new_disk == disk) {
			/* back at the starting point: nothing better found */
			new_disk = wonly_disk;
			break;
		}
	}

	if (new_disk < 0)
		goto rb_out;

	disk = new_disk;
	/* now disk == new_disk == starting point for search */

	/*
	 * Don't change to another disk for sequential reads:
	 */
	if (conf->next_seq_sect == this_sector)
		goto rb_out;
	if (this_sector == conf->mirrors[new_disk].head_position)
		goto rb_out;

	current_distance = abs(this_sector - conf->mirrors[disk].head_position);

	/* Find the disk whose head is closest */

	do {
		if (disk <= 0)
			disk = conf->raid_disks;
		disk--;

		rdev = rcu_dereference(conf->mirrors[disk].rdev);

		if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
		    !test_bit(In_sync, &rdev->flags) ||
		    test_bit(WriteMostly, &rdev->flags))
			continue;

		/* a mirror with nothing pending is taken immediately */
		if (!atomic_read(&rdev->nr_pending)) {
			new_disk = disk;
			break;
		}
		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
		if (new_distance < current_distance) {
			current_distance = new_distance;
			new_disk = disk;
		}
	} while (disk != conf->last_used);

 rb_out:


	if (new_disk >= 0) {
		rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
		if (!rdev)
			goto retry;
		atomic_inc(&rdev->nr_pending);
		if (!test_bit(In_sync, &rdev->flags)) {
			/* cannot risk returning a device that failed
			 * before we inc'ed nr_pending
			 */
			rdev_dec_pending(rdev, conf->mddev);
			goto retry;
		}
		conf->next_seq_sect = this_sector + sectors;
		conf->last_used = new_disk;
	}
	rcu_read_unlock();

	return new_disk;
}

static void unplug_slaves(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;

	rcu_read_lock();
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
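/*
 * Keyboard shortcuts
 * Copy code: Ctrl + C
 * Search code: Ctrl + F
 * Full-screen mode: F11
 * Increase font size: Ctrl + =
 * Decrease font size: Ctrl + -
 * Show shortcuts: ?
 */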