raid10.c
来自「linux 内核源代码」· C语言 代码 · 共 2,188 行 · 第 1/4 页
C
2,188 行
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "dm-bio-list.h"
#include <linux/raid/raid10.h>
#include <linux/raid/bitmap.h>

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *
 * The data to be stored is divided into chunks using chunksize.
 * Each device is divided into far_copies sections.
 * In each section, chunks are laid out in a style similar to raid0, but
 * near_copies copies of each chunk is stored (each on a different drive).
 * The starting device for each section is offset near_copies from the starting
 * device of the previous section.
 * Thus there are (near_copies*far_copies) of each chunk, and each is on a
 * different drive.
 * near_copies and far_copies must be at least one, and their product is at
 * most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, there are adjacent stripes.
 */

/*
 * Number of guaranteed r10bios in case of extreme VM load:
 */
#define NR_RAID10_BIOS 256

static void unplug_slaves(mddev_t *mddev);

static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);

/* Allocate a bare r10bio sized for conf->copies device slots. */
static void *r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	conf_t *conf = data;
	r10bio_t *r10_bio;
	int size = offsetof(struct r10bio_s, devs[conf->copies]);

	/* allocate a r10bio with room for raid_disks entries in the bios array */
	r10_bio = kzalloc(size, gfp_flags);
	if (!r10_bio)
		unplug_slaves(conf->mddev);

	return r10_bio;
}

static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}

#define RESYNC_BLOCK_SIZE (64*1024)
//#define RESYNC_BLOCK_SIZE PAGE_SIZE
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (2048*1024)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf)
 *
 */
static void *r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	conf_t *conf = data;
	struct page *page;
	r10bio_t *r10_bio;
	struct bio *bio;
	int i, j;
	int nalloc;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio) {
		unplug_slaves(conf->mddev);
		return NULL;
	}

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
		if (!bio)
			goto out_free_bio;
		r10_bio->devs[j].bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0 ; j < nalloc; j++) {
		bio = r10_bio->devs[j].bio;
		for (i = 0; i < RESYNC_PAGES; i++) {
			page = alloc_page(gfp_flags);
			if (unlikely(!page))
				goto out_free_pages;

			bio->bi_io_vec[i].bv_page = page;
		}
	}

	return r10_bio;

out_free_pages:
	/* free the pages of the bio that failed part-way, then all earlier bios */
	for ( ; i > 0 ; i--)
		safe_put_page(bio->bi_io_vec[i-1].bv_page);
	while (j--)
		for (i = 0; i < RESYNC_PAGES ; i++)
			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
	j = -1;
out_free_bio:
	while ( ++j < nalloc )
		bio_put(r10_bio->devs[j].bio);
	r10bio_pool_free(r10_bio, conf);
	return NULL;
}

static void r10buf_pool_free(void *__r10_bio, void *data)
{
	int i;
	conf_t *conf = data;
	r10bio_t *r10bio = __r10_bio;
	int j;

	for (j = 0; j < conf->copies; j++) {
		struct bio *bio = r10bio->devs[j].bio;
		if (bio) {
			for (i = 0; i < RESYNC_PAGES; i++) {
				safe_put_page(bio->bi_io_vec[i].bv_page);
				bio->bi_io_vec[i].bv_page = NULL;
			}
			bio_put(bio);
		}
	}
	r10bio_pool_free(r10bio, conf);
}

/* Drop every per-device bio attached to an r10bio (IO_BLOCKED is a sentinel,
 * not a real bio, so it must not be bio_put()). */
static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
{
	int i;

	for (i = 0; i < conf->copies; i++) {
		struct bio **bio = &r10_bio->devs[i].bio;
		if (*bio && *bio != IO_BLOCKED)
			bio_put(*bio);
		*bio = NULL;
	}
}

static void free_r10bio(r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	allow_barrier(conf);

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, conf->r10bio_pool);
}

static void put_buf(r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	mempool_free(r10_bio, conf->r10buf_pool);

	lower_barrier(conf);
}

/* Queue an r10bio for the raid10d thread to retry, and kick the thread. */
static void reschedule_retry(r10bio_t *r10_bio)
{
	unsigned long flags;
	mddev_t *mddev = r10_bio->mddev;
	conf_t *conf = mddev_to_conf(mddev);

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	conf->nr_queued++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(r10bio_t *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;

	bio_endio(bio,
		  test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, r10bio_t *r10_bio)
{
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

static void raid10_end_read_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t *r10_bio = (r10bio_t *)(bio->bi_private);
	int slot, dev;
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	slot = r10_bio->read_slot;
	dev = r10_bio->devs[slot].devnum;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(slot, r10_bio);

	if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);
		raid_end_bio_io(r10_bio);
	} else {
		/*
		 * oops, read error:
		 */
		char b[BDEVNAME_SIZE];
		if (printk_ratelimit())
			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
			       bdevname(conf->mirrors[dev].rdev->bdev, b),
			       (unsigned long long)r10_bio->sector);
		reschedule_retry(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}

static void raid10_end_write_request(struct bio *bio, int error)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	r10bio_t *r10_bio = (r10bio_t *)(bio->bi_private);
	int slot, dev;
	conf_t *conf = mddev_to_conf(r10_bio->mddev);

	/* find which device slot this completion belongs to */
	for (slot = 0; slot < conf->copies; slot++)
		if (r10_bio->devs[slot].bio == bio)
			break;
	dev = r10_bio->devs[slot].devnum;

	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate) {
		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
		/* an I/O failed, we can't clear the bitmap */
		set_bit(R10BIO_Degraded, &r10_bio->state);
	} else
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);

	update_head_pos(slot, r10_bio);

	/*
	 *
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		/* clear the bitmap if all writes complete successfully */
		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
				r10_bio->sectors,
				!test_bit(R10BIO_Degraded, &r10_bio->state),
				0);
		md_write_end(r10_bio->mddev);
		raid_end_bio_io(r10_bio);
	}

	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters:  near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the one device.
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address
 */

static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
{
	int n, f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;

	int slot = 0;

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> conf->chunk_shift;
	sector = r10bio->sector & conf->chunk_mask;

	chunk *= conf->near_copies;
	stripe = chunk;
	dev = sector_div(stripe, conf->raid_disks);
	if (conf->far_offset)
		stripe *= conf->far_copies;

	sector += stripe << conf->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < conf->near_copies; n++) {
		int d = dev;
		sector_t s = sector;
		r10bio->devs[slot].addr = sector;
		r10bio->devs[slot].devnum = d;
		slot++;

		for (f = 1; f < conf->far_copies; f++) {
			d += conf->near_copies;
			if (d >= conf->raid_disks)
				d -= conf->raid_disks;
			s += conf->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		if (dev >= conf->raid_disks) {
			dev = 0;
			sector += (conf->chunk_mask + 1);
		}
	}
	BUG_ON(slot != conf->copies);
}

static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;

	offset = sector & conf->chunk_mask;
	if (conf->far_offset) {
		int fc;
		chunk = sector >> conf->chunk_shift;
		fc = sector_div(chunk, conf->far_copies);
		dev -= fc * conf->near_copies;
		if (dev < 0)
			dev += conf->raid_disks;
	} else {
		while (sector >= conf->stride) {
			sector -= conf->stride;
			if (dev < conf->near_copies)
				dev += conf->raid_disks - conf->near_copies;
			else
				dev -= conf->near_copies;
		}
		chunk = sector >> conf->chunk_shift;
	}
	vchunk = chunk * conf->raid_disks + dev;
	sector_div(vchunk, conf->near_copies);
	return (vchunk << conf->chunk_shift) + offset;
}

/**
 *	raid10_mergeable_bvec -- tell bio layer if two requests can be merged
 *	@q: request queue
 *	@bio: the buffer head that's been built up so far
 *	@bio_vec: the request that could be merged to it.
 *
 *	Return amount of bytes we can accept at this offset
 *	If near_copies == raid_disk, there are no striping issues,
 *	but in that case, the function isn't called at all.
 */
static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio,
				 struct bio_vec *bio_vec)
{
	mddev_t *mddev = q->queuedata;
	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
	int max;
	unsigned int chunk_sectors = mddev->chunk_size >> 9;
	unsigned int bio_sectors = bio->bi_size >> 9;

	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	if (max <= bio_vec->bv_len && bio_sectors == 0)
		return bio_vec->bv_len;
	else
		return max;
}

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
*//* * FIXME: possibly should rethink readbalancing and do it differently * depending on near_copies / far_copies geometry. */static int read_balance(conf_t *conf, r10bio_t *r10_bio){ const unsigned long this_sector = r10_bio->sector; int disk, slot, nslot; const int sectors = r10_bio->sectors; sector_t new_distance, current_distance; mdk_rdev_t *rdev; raid10_find_phys(conf, r10_bio); rcu_read_lock(); /* * Check if we can balance. We can balance on the whole * device if no resync is going on (recovery is ok), or below * the resync window. We take the first readable disk when * above the resync window. */ if (conf->mddev->recovery_cp < MaxSector && (this_sector + sectors >= conf->next_resync)) { /* make sure that disk is operational */ slot = 0; disk = r10_bio->devs[slot].devnum; while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || r10_bio->devs[slot].bio == IO_BLOCKED || !test_bit(In_sync, &rdev->flags)) { slot++; if (slot == conf->copies) { slot = 0; disk = -1; break; } disk = r10_bio->devs[slot].devnum; } goto rb_out; } /* make sure the disk is operational */ slot = 0; disk = r10_bio->devs[slot].devnum; while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || r10_bio->devs[slot].bio == IO_BLOCKED || !test_bit(In_sync, &rdev->flags)) { slot ++; if (slot == conf->copies) { disk = -1; goto rb_out; } disk = r10_bio->devs[slot].devnum; } current_distance = abs(r10_bio->devs[slot].addr - conf->mirrors[disk].head_position); /* Find the disk whose head is closest */ for (nslot = slot; nslot < conf->copies; nslot++) { int ndisk = r10_bio->devs[nslot].devnum; if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || r10_bio->devs[nslot].bio == IO_BLOCKED ||
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?