raid5.c
来自「linux 内核源代码」· C语言 代码 · 共 2,325 行 · 第 1/5 页
C
2,325 行
}static voidhandle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, struct bio **return_bi){ int i; for (i = disks; i--; ) { struct bio *bi; int bitmap_end = 0; if (test_bit(R5_ReadError, &sh->dev[i].flags)) { mdk_rdev_t *rdev; rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) /* multiple read failures in one stripe */ md_error(conf->mddev, rdev); rcu_read_unlock(); } spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; if (bi) { s->to_write--; bitmap_end = 1; } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); if (--bi->bi_phys_segments == 0) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; } bi = nextbi; } /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; if (bi) bitmap_end = 1; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); if (--bi->bi_phys_segments == 0) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; } bi = bi2; } /* fail any reads if this device is non-operational and * the data has not reached the cache yet. */ if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { bi = sh->dev[i].toread; sh->dev[i].toread = NULL; if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); if (bi) s->to_read--; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); if (--bi->bi_phys_segments == 0) { bi->bi_next = *return_bi; *return_bi = bi; } bi = nextbi; } } spin_unlock_irq(&conf->device_lock); if (bitmap_end) bitmap_endwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, 0, 0); }}/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks * to process */static int __handle_issuing_new_read_requests5(struct stripe_head *sh, struct stripe_head_state *s, int disk_idx, int disks){ struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; /* don't schedule compute operations or reads on the parity block while * a check is in flight */ if ((disk_idx == sh->pd_idx) && test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) return ~0; /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || s->syncing || s->expanding || (s->failed && (failed_dev->toread || (failed_dev->towrite && !test_bit(R5_OVERWRITE, &failed_dev->flags) ))))) { /* 1/ We would like to get this block, possibly by computing it, * but we might not be able to. * * 2/ Since parity check operations potentially make the parity * block !uptodate it will need to be refreshed before any * compute operations on data disks are scheduled. * * 3/ We hold off parity block re-reads until check operations * have quiesced. */ if ((s->uptodate == disks - 1) && !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; s->req_compute = 1; sh->ops.count++; /* Careful: from this point on 'uptodate' is in the eye * of raid5_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will * be R5_UPTODATE by the time it is needed for a * subsequent operation. */ s->uptodate++; return 0; /* uptodate + compute == disks */ } else if ((s->uptodate < disks - 1) && test_bit(R5_Insync, &dev->flags)) { /* Note: we hold off compute operations while checks are * in flight, but we still prefer 'compute' over 'read' * hence we only read if (uptodate < * disks-1) */ set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; s->locked++; pr_debug("Reading block %d (sync=%d)\n", disk_idx, s->syncing); } } return ~0;}static void handle_issuing_new_read_requests5(struct stripe_head *sh, struct stripe_head_state *s, int disks){ int i; /* Clear completed compute operations. Parity recovery * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled * later on in this routine */ if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); } /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write */ if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { for (i = disks; i--; ) if (__handle_issuing_new_read_requests5( sh, s, i, disks) == 0) break; } set_bit(STRIPE_HANDLE, &sh->state);}static void handle_issuing_new_read_requests6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks){ int i; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || s->syncing || s->expanding || (s->failed >= 1 && (sh->dev[r6s->failed_num[0]].toread || s->to_write)) || (s->failed >= 2 && (sh->dev[r6s->failed_num[1]].toread || s->to_write)))) { /* we would like to get this block, possibly * by computing it, but we might not be able to */ if (s->uptodate == disks-1) { pr_debug("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i, 0); s->uptodate++; } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { /* Computing 2-failure is *very* expensive; only * do it if failed >= 2 */ int other; for (other = disks; other--; ) { if (other == i) continue; if (!test_bit(R5_UPTODATE, &sh->dev[other].flags)) break; } BUG_ON(other < 0); pr_debug("Computing stripe %llu blocks %d,%d\n", (unsigned long long)sh->sector, i, other); compute_block_2(sh, i, other); s->uptodate += 2; } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; pr_debug("Reading block %d (sync=%d)\n", i, s->syncing); } } } set_bit(STRIPE_HANDLE, &sh->state);}/* handle_completed_write_requests * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. */static void handle_completed_write_requests(raid5_conf_t *conf, struct stripe_head *sh, int disks, struct bio **return_bi){ int i; struct r5dev *dev; for (i = disks; i--; ) if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) { /* We can return any write requests */ struct bio *wbi, *wbi2; int bitmap_end = 0; pr_debug("Return write for disc %d\n", i); spin_lock_irq(&conf->device_lock); wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); if (--wbi->bi_phys_segments == 0) { md_write_end(conf->mddev); wbi->bi_next = *return_bi; *return_bi = wbi; } wbi = wbi2; } if (dev->towrite == NULL) bitmap_end = 1; spin_unlock_irq(&conf->device_lock); if (bitmap_end) bitmap_endwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), 0); } }}static void handle_issuing_new_write_requests5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks){ int rmw = 0, rcw = 0, i; for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rmw++; else rmw += 2*disks; /* cannot read it */ } /* Would I have to read this buffer for reconstruct_write */ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rcw++; else rcw += 2*disks; } } pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); if (rmw < rcw && rmw > 0) /* prefer read-modify-write, but need to get some data */ for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { pr_debug("Read_old block " "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); if (!test_and_set_bit( STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } } } if (rcw <= rmw && rcw > 0) /* want reconstruct write, but need to get some data */ for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { pr_debug("Read_old block " "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); if (!test_and_set_bit( STRIPE_OP_IO, &sh->ops.pending)) sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } } } /* now if nothing is locked, and if we have enough data, * we can start a write request */ /* since handle_stripe can be called at any time we need to handle the * case where a compute block operation has been submitted and then a * subsequent call wants to start a write request. raid5_run_ops only * handles the case where compute block and postxor are requested * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. */ if ((s->req_compute || !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state))) s->locked += handle_write_operations5(sh, rcw == 0, 0);}static void handle_issuing_new_write_requests6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks){ int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; int qd_idx = r6s->qd_idx; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Would I have to read this buffer for reconstruct_write */ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != pd_idx && i != qd_idx && (!test_bit(R5_LOCKED, &dev->flags) ) && !test_bit(R5_UPTODATE, &dev->flags)) { if (test_bit(R5_Insync, &dev->flags)) rcw++; else { pr_debug("raid6: must_compute: " "disk %d flags=%#lx\n", i, dev->flags); must_compute++; } } } pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", (unsigned long long)sh->sector, rcw, must_compute); set_bit(STRIPE_HANDLE, &sh->state); if (rcw > 0) /* want reconstruct write, but need to get some data */ for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (!test_bit(R5_OVERWRITE, &dev->flags) && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && test_bit(R5_Insync, &dev->flags)) { if ( test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { pr_debug("Read_old stripe %llu " "block %d for Reconstruct\n", (unsigned long long)sh->sector, i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; } else { pr_debug("Request delayed stripe %llu " "block %d for Reconstruct\n", (unsigned long long)sh->sector, i); set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } } } /* now if nothing is locked, and if we have enough data, we can start a * write request */ if (s->locked == 0 && rcw == 0 && !test_bit(STRIPE_BIT_DELAY, &sh->state)) { if (must_compute > 0) { /* We have failed blocks and need to compute them */ switch (s->failed) { case 0: BUG(); case 1: compute_block_1(sh, r6s->failed_num[0], 0); break; case 2: compute_block_2(sh, r6s->failed_num[0], r6s->failed_num[1]); break; default: /* This request should have been failed? */ BUG(); } } pr_debug("Com
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?