/*
 * raid6main.c — RAID-6 stripe handling
 * From Linux Kernel 2.6.9 (OMAP1710 tree); C source, 2,095 lines total.
 * NOTE: this is page 1 of 4 of a scraped listing — a partial view of the file.
 */
* need to be failed */ if (failed > 2 && to_read+to_write+written) { spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; if (bi) to_write--; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); if (--bi->bi_phys_segments == 0) { md_write_end(conf->mddev); bi->bi_next = return_bi; return_bi = bi; } bi = nextbi; } /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); if (--bi->bi_phys_segments == 0) { md_write_end(conf->mddev); bi->bi_next = return_bi; return_bi = bi; } bi = bi2; } /* fail any reads if this device is non-operational */ if (!test_bit(R5_Insync, &sh->dev[i].flags)) { bi = sh->dev[i].toread; sh->dev[i].toread = NULL; if (bi) to_read--; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); if (--bi->bi_phys_segments == 0) { bi->bi_next = return_bi; return_bi = bi; } bi = nextbi; } } } spin_unlock_irq(&conf->device_lock); } if (failed > 2 && syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); syncing = 0; } /* * might be able to return some write requests if the parity blocks * are safe, or on a failed drive */ pdev = &sh->dev[pd_idx]; p_failed = (failed >= 1 && failed_num[0] == pd_idx) || (failed >= 2 && failed_num[1] == pd_idx); qdev = &sh->dev[qd_idx]; q_failed = (failed >= 1 && failed_num[0] == qd_idx) || (failed >= 2 && failed_num[1] == qd_idx); if ( written && ( p_failed || ((test_bit(R5_Insync, &pdev->flags) && !test_bit(R5_LOCKED, &pdev->flags) && test_bit(R5_UPTODATE, &pdev->flags))) ) && ( q_failed || 
((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { /* any written block on an uptodate or failed drive can be * returned. Note that if we 'wrote' to a failed drive, * it will be UPTODATE, but never LOCKED, so we don't need * to test 'failed' directly. */ for (i=disks; i--; ) if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags) ) { /* We can return any write requests */ struct bio *wbi, *wbi2; PRINTK("Return write for stripe %llu disc %d\n", (unsigned long long)sh->sector, i); spin_lock_irq(&conf->device_lock); wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); if (--wbi->bi_phys_segments == 0) { md_write_end(conf->mddev); wbi->bi_next = return_bi; return_bi = wbi; } wbi = wbi2; } spin_unlock_irq(&conf->device_lock); } } } /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. 
*/ if (to_read || non_overwrite || (syncing && (uptodate < disks))) { for (i=disks; i--;) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || syncing || (failed >= 1 && (sh->dev[failed_num[0]].toread || (sh->dev[failed_num[0]].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num[0]].flags)))) || (failed >= 2 && (sh->dev[failed_num[1]].toread || (sh->dev[failed_num[1]].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num[1]].flags)))) ) ) { /* we would like to get this block, possibly * by computing it, but we might not be able to */ if (uptodate == disks-1) { PRINTK("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i); uptodate++; } else if ( uptodate == disks-2 && failed >= 2 ) { /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ int other; for (other=disks; other--;) { if ( other == i ) continue; if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) ) break; } BUG_ON(other < 0); PRINTK("Computing stripe %llu blocks %d,%d\n", (unsigned long long)sh->sector, i, other); compute_block_2(sh, i, other); uptodate += 2; } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags);#if 0 /* if I am just reading this block and we don't have a failed drive, or any pending writes then sidestep the cache */ if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext && ! 
syncing && !failed && !to_write) { sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page; sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data; }#endif locked++; PRINTK("Reading block %d (sync=%d)\n", i, syncing); if (syncing) md_sync_acct(conf->disks[i].rdev->bdev, STRIPE_SECTORS); } } } set_bit(STRIPE_HANDLE, &sh->state); } /* now to consider writing and what else, if anything should be read */ if (to_write) { int rcw=0, must_compute=0; for (i=disks ; i--;) { dev = &sh->dev[i]; /* Would I have to read this buffer for reconstruct_write */ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != pd_idx && i != qd_idx && (!test_bit(R5_LOCKED, &dev->flags)#if 0 || sh->bh_page[i] != bh->b_page#endif ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags)) rcw++; else { PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags); must_compute++; } } } PRINTK("for sector %llu, rcw=%d, must_compute=%d\n", (unsigned long long)sh->sector, rcw, must_compute); set_bit(STRIPE_HANDLE, &sh->state); if (rcw > 0) /* want reconstruct write, but need to get some data */ for (i=disks; i--;) { dev = &sh->dev[i]; if (!test_bit(R5_OVERWRITE, &dev->flags) && !(failed == 0 && (i == pd_idx || i == qd_idx)) && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && test_bit(R5_Insync, &dev->flags)) { if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { PRINTK("Read_old stripe %llu block %d for Reconstruct\n", (unsigned long long)sh->sector, i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); locked++; } else { PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", (unsigned long long)sh->sector, i); set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } } } /* now if nothing is locked, and if we have enough data, we can start a write request */ if (locked == 0 && rcw == 0) { if ( must_compute > 0 ) { /* We have failed blocks and need to compute them */ switch ( failed ) { case 0: BUG(); case 1: 
compute_block_1(sh, failed_num[0]); break; case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; default: BUG(); /* This request should have been failed? */ } } PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector); compute_parity(sh, RECONSTRUCT_WRITE); /* now every locked buffer is ready to be written */ for (i=disks; i--;) if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { PRINTK("Writing stripe %llu block %d\n", (unsigned long long)sh->sector, i); locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags);#if 0 /**** FIX: I don't understand the logic here... ****/ if (!test_bit(R5_Insync, &sh->dev[i].flags) || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */ set_bit(STRIPE_INSYNC, &sh->state);#endif } if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); } } } /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough data * is available */ if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { set_bit(STRIPE_HANDLE, &sh->state);#if 0 /* RAID-6: Don't support CHECK PARITY yet */ if (failed == 0) { char *pagea; if (uptodate != disks) BUG(); compute_parity(sh, CHECK_PARITY); uptodate--; pagea = page_address(sh->dev[pd_idx].page); if ((*(u32*)pagea) == 0 && !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) { /* parity is correct (on disc, not in buffer any more) */ set_bit(STRIPE_INSYNC, &sh->state); } }#endif if (!test_bit(STRIPE_INSYNC, &sh->state)) { int failed_needupdate[2]; struct r5dev *adev, *bdev; if ( failed < 1 ) failed_num[0] = pd_idx; if ( failed < 2 ) failed_num[1] = (failed_num[0] == qd_idx) ? 
pd_idx : qd_idx; failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags); PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]);#if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ /* should be able to compute the missing block(s) and write to spare */ if ( failed_needupdate[0] ^ failed_needupdate[1] ) { if (uptodate+1 != disks) BUG(); compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); uptodate++; } else if ( failed_needupdate[0] & failed_needupdate[1] ) { if (uptodate+2 != disks) BUG(); compute_block_2(sh, failed_num[0], failed_num[1]); uptodate += 2; }#else compute_block_2(sh, failed_num[0], failed_num[1]); uptodate += failed_needupdate[0] + failed_needupdate[1];#endif if (uptodate != disks) BUG(); PRINTK("Marking for sync stripe %llu blocks %d,%d\n", (unsigned long long)sh->sector, failed_num[0], failed_num[1]); /**** FIX: Should we really do both of these unconditionally? 
****/ adev = &sh->dev[failed_num[0]]; locked += !test_bit(R5_LOCKED, &adev->flags); set_bit(R5_LOCKED, &adev->flags); set_bit(R5_Wantwrite, &adev->flags); bdev = &sh->dev[failed_num[1]]; locked += !test_bit(R5_LOCKED, &bdev->flags); set_bit(R5_LOCKED, &bdev->flags); set_bit(R5_Wantwrite, &bdev->flags); set_bit(STRIPE_INSYNC, &sh->state); set_bit(R5_Syncio, &adev->flags); set_bit(R5_Syncio, &bdev->flags); } } if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS,1); clear_bit(STRIPE_SYNCING, &sh->state); } spin_unlock(&sh->lock); while ((bi=return_bi)) { int bytes = bi->bi_size; return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; bi->bi_end_io(bi, bytes, 0); } for (i=disks; i-- ;) { int rw; struct bio *bi; mdk_rdev_t *rdev; if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) rw = 1; else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = 0; else continue; bi = &sh->dev[i].req; bi->bi_rw = rw; if (rw) bi->bi_end_io = raid6_end_write_request; else bi->bi_end_io = raid6_end_read_request; spin_lock_irq(&conf->device_lock); rdev = conf->disks[i].rdev; if (rdev && rdev->faulty) rdev = NULL; if (rdev) atomic_inc(&rdev->nr_pending); spin_unlock_irq(&conf->device_lock); if (rdev) { if (test_bit(R5_Syncio, &sh->dev[i].flags)) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; PRINTK("for %llu schedule op %ld on disc %d\n", (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); bi->bi_sector = sh->sector + rdev->data_offset; bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_vcnt = 1; bi->bi_idx = 0; bi->bi_io_vec = &sh->dev[i].vec; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; bi->bi_next = NULL; generic_make_request(bi); } else { PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); } }}static inline 
/*
 * Move delayed stripes back onto the handle list so raid6d will process
 * them, but only while the number of active preread stripes is below
 * IO_THRESHOLD.  Caller must hold conf->device_lock (every visible caller
 * in this chunk takes it before calling — TODO confirm for callers
 * outside this view).
 */
void raid6_activate_delayed(raid6_conf_t *conf)
{
	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
		while (!list_empty(&conf->delayed_list)) {
			struct list_head *l = conf->delayed_list.next;
			struct stripe_head *sh;
			sh = list_entry(l, struct stripe_head, lru);
			list_del_init(l);
			clear_bit(STRIPE_DELAYED, &sh->state);
			/* Count the stripe as preread-active exactly once,
			 * even if it gets re-queued. */
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			list_add_tail(&sh->lru, &conf->handle_list);
		}
	}
}

/*
 * Unplug the request queues of all member devices that have I/O pending.
 * device_lock is dropped around the unplug_fn call (it may sleep or
 * recurse into the block layer); rdev->nr_pending is bumped first so the
 * rdev cannot disappear while the lock is released.
 */
static void unplug_slaves(mddev_t *mddev)
{
	raid6_conf_t *conf = mddev_to_conf(mddev);
	int i;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		if (rdev && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			/* Pin the rdev across the unlocked region. */
			atomic_inc(&rdev->nr_pending);
			spin_unlock_irqrestore(&conf->device_lock, flags);
			if (r_queue && r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);
			spin_lock_irqsave(&conf->device_lock, flags);
			atomic_dec(&rdev->nr_pending);
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

/*
 * Queue unplug callback for the RAID-6 array: release any queue plug,
 * re-activate delayed stripes, wake the md thread, then propagate the
 * unplug to all member devices.
 */
static void raid6_unplug_device(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;
	raid6_conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);

	if (blk_remove_plug(q))
		raid6_activate_delayed(conf);
	md_wakeup_thread(mddev->thread);

	spin_unlock_irqrestore(&conf->device_lock, flags);

	unplug_slaves(mddev);
}

/*
 * Issue a cache flush to every non-faulty member device.  Returns 0 on
 * success; -EOPNOTSUPP if some member queue has no issue_flush_fn; or the
 * first member's error code.  Stops at the first failure.
 */
static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	raid6_conf_t *conf = mddev_to_conf(mddev);
	int i, ret = 0;

	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue;

			if (!bdev)
				continue;

			r_queue = bdev_get_queue(bdev);
			if (!r_queue)
				continue;

			if (!r_queue->issue_flush_fn) {
				ret = -EOPNOTSUPP;
				break;
			}

			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
						      error_sector);
			if (ret)
				break;
		}
	}
	return ret;
}

/* Plug the array's queue so requests can be batched before dispatch. */
static inline void raid6_plug_device(raid6_conf_t *conf)
{
	spin_lock_irq(&conf->device_lock);
	blk_plug_device(conf->mddev->queue);
	spin_unlock_irq(&conf->device_lock);
}

/*
 * Entry point for bios submitted to the RAID-6 array: account the I/O,
 * round the start sector down to a stripe boundary, and (below, continued
 * on the next page of this listing) walk the bio stripe by stripe.
 * NOTE(review): function is truncated at the end of this chunk.
 */
static int make_request (request_queue_t *q, struct bio * bi)
{
	mddev_t *mddev = q->queuedata;
	raid6_conf_t *conf = mddev_to_conf(mddev);
	const unsigned int raid_disks = conf->raid_disks;
	/* RAID-6: two of the disks hold parity (P and Q). */
	const unsigned int data_disks = raid_disks - 2;
	unsigned int dd_idx, pd_idx;
	sector_t new_sector;
	sector_t logical_sector, last_sector;
	struct stripe_head *sh;

	/* Per-direction disk statistics. */
	if (bio_data_dir(bi)==WRITE) {
		disk_stat_inc(mddev->gendisk, writes);
		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
	} else {
		disk_stat_inc(mddev->gendisk, reads);
		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
	}

	/* Align the starting sector down to a STRIPE_SECTORS boundary. */
	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
	last_sector = bi->bi_sector + (bi->bi_size>>9);
	bi->bi_next = NULL;
	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
	if ( bio_data_dir(bi) == WRITE )
		md_write_start(mddev);
	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
/*
 * (Scrape residue — online code-viewer hotkey help, not part of the source:
 *  copy code Ctrl+C · search code Ctrl+F · fullscreen F11 ·
 *  larger font Ctrl+= · smaller font Ctrl+- · show shortcuts ?)
 */
显示快捷键?