/*
 * raid10.c
 * From "Linux Kernel 2.6.9 for OMAP1710" - C source, 1,781 lines total (page 1 of 4).
 */
mempool_destroy(conf->r10buf_pool); conf->r10buf_pool = NULL;}static int raid10_spare_active(mddev_t *mddev){ int i; conf_t *conf = mddev->private; mirror_info_t *tmp; spin_lock_irq(&conf->device_lock); /* * Find all non-in_sync disks within the RAID10 configuration * and mark them in_sync */ for (i = 0; i < conf->raid_disks; i++) { tmp = conf->mirrors + i; if (tmp->rdev && !tmp->rdev->faulty && !tmp->rdev->in_sync) { conf->working_disks++; mddev->degraded--; tmp->rdev->in_sync = 1; } } spin_unlock_irq(&conf->device_lock); print_conf(conf); return 0;}static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev){ conf_t *conf = mddev->private; int found = 0; int mirror; mirror_info_t *p; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ return 0; spin_lock_irq(&conf->device_lock); for (mirror=0; mirror < mddev->raid_disks; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { p->rdev = rdev; blk_queue_stack_limits(mddev->queue, rdev->bdev->bd_disk->queue); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. 
*/ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) mddev->queue->max_sectors = (PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; found = 1; break; } spin_unlock_irq(&conf->device_lock); print_conf(conf); return found;}static int raid10_remove_disk(mddev_t *mddev, int number){ conf_t *conf = mddev->private; int err = 1; mirror_info_t *p = conf->mirrors+ number; print_conf(conf); spin_lock_irq(&conf->device_lock); if (p->rdev) { if (p->rdev->in_sync || atomic_read(&p->rdev->nr_pending)) { err = -EBUSY; goto abort; } p->rdev = NULL; err = 0; } if (err) MD_BUG();abort: spin_unlock_irq(&conf->device_lock); print_conf(conf); return err;}static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error){ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); conf_t *conf = mddev_to_conf(r10_bio->mddev); int i,d; if (bio->bi_size) return 1; for (i=0; i<conf->copies; i++) if (r10_bio->devs[i].bio == bio) break; if (i == conf->copies) BUG(); update_head_pos(i, r10_bio); d = r10_bio->devs[i].devnum; if (!uptodate) md_error(r10_bio->mddev, conf->mirrors[d].rdev); /* for reconstruct, we always reschedule after a read. 
* for resync, only after all reads */ if (test_bit(R10BIO_IsRecover, &r10_bio->state) || atomic_dec_and_test(&r10_bio->remaining)) { /* we have read all the blocks, * do the comparison in process context in raid10d */ reschedule_retry(r10_bio); } rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); return 0;}static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error){ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); mddev_t *mddev = r10_bio->mddev; conf_t *conf = mddev_to_conf(mddev); int i,d; if (bio->bi_size) return 1; for (i = 0; i < conf->copies; i++) if (r10_bio->devs[i].bio == bio) break; d = r10_bio->devs[i].devnum; if (!uptodate) md_error(mddev, conf->mirrors[d].rdev); update_head_pos(i, r10_bio); while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ md_done_sync(mddev, r10_bio->sectors, 1); put_buf(r10_bio); break; } else { r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; put_buf(r10_bio); r10_bio = r10_bio2; } } rdev_dec_pending(conf->mirrors[d].rdev, mddev); return 0;}/* * Note: sync and recover and handled very differently for raid10 * This code is for resync. * For resync, we read through virtual addresses and read all blocks. * If there is any error, we schedule a write. The lowest numbered * drive is authoritative. * However requests come for physical address, so we need to map. * For every physical address there are raid_disks/copies virtual addresses, * which is always are least one, but is not necessarly an integer. * This means that a physical address can span multiple chunks, so we may * have to submit multiple io requests for a single sync request. 
*//* * We check if all blocks are in-sync and only write to blocks that * aren't in sync */static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio){ conf_t *conf = mddev_to_conf(mddev); int i, first; struct bio *tbio, *fbio; atomic_set(&r10_bio->remaining, 1); /* find the first device with a block */ for (i=0; i<conf->copies; i++) if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) break; if (i == conf->copies) goto done; first = i; fbio = r10_bio->devs[i].bio; /* now find blocks with errors */ for (i=first+1 ; i < conf->copies ; i++) { int vcnt, j, d; if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) continue; /* We know that the bi_io_vec layout is the same for * both 'first' and 'i', so we just compare them. * All vec entries are PAGE_SIZE; */ tbio = r10_bio->devs[i].bio; vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); for (j = 0; j < vcnt; j++) if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), page_address(tbio->bi_io_vec[j].bv_page), PAGE_SIZE)) break; if (j == vcnt) continue; /* Ok, we need to write this bio * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ tbio->bi_vcnt = vcnt; tbio->bi_size = r10_bio->sectors << 9; tbio->bi_idx = 0; tbio->bi_phys_segments = 0; tbio->bi_hw_segments = 0; tbio->bi_hw_front_size = 0; tbio->bi_hw_back_size = 0; tbio->bi_flags &= ~(BIO_POOL_MASK - 1); tbio->bi_flags |= 1 << BIO_UPTODATE; tbio->bi_next = NULL; tbio->bi_rw = WRITE; tbio->bi_private = r10_bio; tbio->bi_sector = r10_bio->devs[i].addr; for (j=0; j < vcnt ; j++) { tbio->bi_io_vec[j].bv_offset = 0; tbio->bi_io_vec[j].bv_len = PAGE_SIZE; memcpy(page_address(tbio->bi_io_vec[j].bv_page), page_address(fbio->bi_io_vec[j].bv_page), PAGE_SIZE); } tbio->bi_end_io = end_sync_write; d = r10_bio->devs[i].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); generic_make_request(tbio); }done: if 
(atomic_dec_and_test(&r10_bio->remaining)) { md_done_sync(mddev, r10_bio->sectors, 1); put_buf(r10_bio); }}/* * Now for the recovery code. * Recovery happens across physical sectors. * We recover all non-is_sync drives by finding the virtual address of * each, and then choose a working drive that also has that virt address. * There is a separate r10_bio for each non-in_sync drive. * Only the first two slots are in use. The first for reading, * The second for writing. * */static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio){ conf_t *conf = mddev_to_conf(mddev); int i, d; struct bio *bio, *wbio; /* move the pages across to the second bio * and submit the write request */ bio = r10_bio->devs[0].bio; wbio = r10_bio->devs[1].bio; for (i=0; i < wbio->bi_vcnt; i++) { struct page *p = bio->bi_io_vec[i].bv_page; bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; wbio->bi_io_vec[i].bv_page = p; } d = r10_bio->devs[1].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); generic_make_request(wbio);}/* * This is a kernel thread which: * * 1. Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems encounter. * 3. Performs writes following reads for array syncronising. 
*/static void raid10d(mddev_t *mddev){ r10bio_t *r10_bio; struct bio *bio; unsigned long flags; conf_t *conf = mddev_to_conf(mddev); struct list_head *head = &conf->retry_list; int unplug=0; mdk_rdev_t *rdev; md_check_recovery(mddev); md_handle_safemode(mddev); for (;;) { char b[BDEVNAME_SIZE]; spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) break; r10_bio = list_entry(head->prev, r10bio_t, retry_list); list_del(head->prev); spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r10_bio->mddev; conf = mddev_to_conf(mddev); if (test_bit(R10BIO_IsSync, &r10_bio->state)) { sync_request_write(mddev, r10_bio); unplug = 1; } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { recovery_request_write(mddev, r10_bio); unplug = 1; } else { int mirror; bio = r10_bio->devs[r10_bio->read_slot].bio; r10_bio->devs[r10_bio->read_slot].bio = NULL; mirror = read_balance(conf, r10_bio); r10_bio->devs[r10_bio->read_slot].bio = bio; if (mirror == -1) { printk(KERN_ALERT "raid10: %s: unrecoverable I/O" " read error for block %llu\n", bdevname(bio->bi_bdev,b), (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); } else { rdev = conf->mirrors[mirror].rdev; if (printk_ratelimit()) printk(KERN_ERR "raid10: %s: redirecting sector %llu to" " another mirror\n", bdevname(rdev->bdev,b), (unsigned long long)r10_bio->sector); bio->bi_bdev = rdev->bdev; bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset; bio->bi_next = NULL; bio->bi_flags &= (1<<BIO_CLONED); bio->bi_flags |= 1 << BIO_UPTODATE; bio->bi_idx = 0; bio->bi_size = r10_bio->sectors << 9; bio->bi_rw = READ; unplug = 1; generic_make_request(bio); } } } spin_unlock_irqrestore(&conf->device_lock, flags); if (unplug) unplug_slaves(mddev);}static int init_resync(conf_t *conf){ int buffs; buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; if (conf->r10buf_pool) BUG(); conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); if (!conf->r10buf_pool) return 
-ENOMEM; conf->next_resync = 0; return 0;}/* * perform a "sync" on one "block" * * We need to make sure that no normal I/O request - particularly write * requests - conflict with active sync requests. * * This is achieved by tracking pending requests and a 'barrier' concept * that can be installed to exclude normal IO requests. * * Resync and recovery are handled very differently. * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. * * For resync, we iterate over virtual addresses, read all copies, * and update if there are differences. If only one copy is live, * skip it. * For recovery, we iterate over physical addresses, read a good * value for each non-in_sync drive, and over-write. * * So, for recovery we may have several outstanding complex requests for a * given address, one for each out-of-sync device. We model this by allocating * a number of r10_bio structures, one for each out-of-sync device. * As we setup these structures, we collect all bio's together into a list * which we then process collectively to add pages, and then process again * to pass to generic_make_request. * * The r10_bio structures are linked using a borrowed master_bio pointer. * This link is counted in ->remaining. When the r10_bio that points to NULL * has its remaining count decremented to 0, the whole complex operation * is complete. * */static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster){ conf_t *conf = mddev_to_conf(mddev); r10bio_t *r10_bio; struct bio *biolist = NULL, *bio; sector_t max_sector, nr_sectors; int disk; int i; sector_t sectors_skipped = 0; int chunks_skipped = 0; if (!conf->r10buf_pool) if (init_resync(conf)) return -ENOMEM; skipped: max_sector = mddev->size << 1; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { close_sync(conf);
/*
 * (Code-viewer chrome, not part of the source.)
 * Keyboard shortcuts: copy code Ctrl+C; search code Ctrl+F; full-screen F11;
 * increase font size Ctrl+=; decrease font size Ctrl+-; show shortcut help ?.
 */