raid5.c

来自「linux 内核源代码」· C语言 代码 · 共 2,325 行 · 第 1/5 页

C
2,325
字号
}static voidhandle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,				struct stripe_head_state *s, int disks,				struct bio **return_bi){	int i;	for (i = disks; i--; ) {		struct bio *bi;		int bitmap_end = 0;		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {			mdk_rdev_t *rdev;			rcu_read_lock();			rdev = rcu_dereference(conf->disks[i].rdev);			if (rdev && test_bit(In_sync, &rdev->flags))				/* multiple read failures in one stripe */				md_error(conf->mddev, rdev);			rcu_read_unlock();		}		spin_lock_irq(&conf->device_lock);		/* fail all writes first */		bi = sh->dev[i].towrite;		sh->dev[i].towrite = NULL;		if (bi) {			s->to_write--;			bitmap_end = 1;		}		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))			wake_up(&conf->wait_for_overlap);		while (bi && bi->bi_sector <			sh->dev[i].sector + STRIPE_SECTORS) {			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);			clear_bit(BIO_UPTODATE, &bi->bi_flags);			if (--bi->bi_phys_segments == 0) {				md_write_end(conf->mddev);				bi->bi_next = *return_bi;				*return_bi = bi;			}			bi = nextbi;		}		/* and fail all 'written' */		bi = sh->dev[i].written;		sh->dev[i].written = NULL;		if (bi) bitmap_end = 1;		while (bi && bi->bi_sector <		       sh->dev[i].sector + STRIPE_SECTORS) {			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);			clear_bit(BIO_UPTODATE, &bi->bi_flags);			if (--bi->bi_phys_segments == 0) {				md_write_end(conf->mddev);				bi->bi_next = *return_bi;				*return_bi = bi;			}			bi = bi2;		}		/* fail any reads if this device is non-operational and		 * the data has not reached the cache yet.		 */		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||		      test_bit(R5_ReadError, &sh->dev[i].flags))) {			bi = sh->dev[i].toread;			sh->dev[i].toread = NULL;			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))				wake_up(&conf->wait_for_overlap);			if (bi) s->to_read--;			while (bi && bi->bi_sector <			       sh->dev[i].sector + STRIPE_SECTORS) {				struct bio *nextbi =					r5_next_bio(bi, sh->dev[i].sector);				clear_bit(BIO_UPTODATE, &bi->bi_flags);				if (--bi->bi_phys_segments == 0) {					bi->bi_next = *return_bi;					*return_bi = bi;				}				bi = nextbi;			}		}		spin_unlock_irq(&conf->device_lock);		if (bitmap_end)			bitmap_endwrite(conf->mddev->bitmap, sh->sector,					STRIPE_SECTORS, 0, 0);	}}/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks * to process */static int __handle_issuing_new_read_requests5(struct stripe_head *sh,			struct stripe_head_state *s, int disk_idx, int disks){	struct r5dev *dev = &sh->dev[disk_idx];	struct r5dev *failed_dev = &sh->dev[s->failed_num];	/* don't schedule compute operations or reads on the parity block while	 * a check is in flight	 */	if ((disk_idx == sh->pd_idx) &&	     test_bit(STRIPE_OP_CHECK, &sh->ops.pending))		return ~0;	/* is the data in this block needed, and can we get it? */	if (!test_bit(R5_LOCKED, &dev->flags) &&	    !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||	     s->syncing || s->expanding || (s->failed &&	     (failed_dev->toread || (failed_dev->towrite &&	     !test_bit(R5_OVERWRITE, &failed_dev->flags)	     ))))) {		/* 1/ We would like to get this block, possibly by computing it,		 * but we might not be able to.		 *		 * 2/ Since parity check operations potentially make the parity		 * block !uptodate it will need to be refreshed before any		 * compute operations on data disks are scheduled.		 *		 * 3/ We hold off parity block re-reads until check operations		 * have quiesced.		 */		if ((s->uptodate == disks - 1) &&		    !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {			set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);			set_bit(R5_Wantcompute, &dev->flags);			sh->ops.target = disk_idx;			s->req_compute = 1;			sh->ops.count++;			/* Careful: from this point on 'uptodate' is in the eye			 * of raid5_run_ops which services 'compute' operations			 * before writes. R5_Wantcompute flags a block that will			 * be R5_UPTODATE by the time it is needed for a			 * subsequent operation.			 */			s->uptodate++;			return 0; /* uptodate + compute == disks */		} else if ((s->uptodate < disks - 1) &&			test_bit(R5_Insync, &dev->flags)) {			/* Note: we hold off compute operations while checks are			 * in flight, but we still prefer 'compute' over 'read'			 * hence we only read if (uptodate < * disks-1)			 */			set_bit(R5_LOCKED, &dev->flags);			set_bit(R5_Wantread, &dev->flags);			if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))				sh->ops.count++;			s->locked++;			pr_debug("Reading block %d (sync=%d)\n", disk_idx,				s->syncing);		}	}	return ~0;}static void handle_issuing_new_read_requests5(struct stripe_head *sh,			struct stripe_head_state *s, int disks){	int i;	/* Clear completed compute operations.  Parity recovery	 * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled	 * later on in this routine	 */	if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&		!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);		clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);	}	/* look for blocks to read/compute, skip this if a compute	 * is already in flight, or if the stripe contents are in the	 * midst of changing due to a write	 */	if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&		!test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&		!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {		for (i = disks; i--; )			if (__handle_issuing_new_read_requests5(				sh, s, i, disks) == 0)				break;	}	set_bit(STRIPE_HANDLE, &sh->state);}static void handle_issuing_new_read_requests6(struct stripe_head *sh,			struct stripe_head_state *s, struct r6_state *r6s,			int disks){	int i;	for (i = disks; i--; ) {		struct r5dev *dev = &sh->dev[i];		if (!test_bit(R5_LOCKED, &dev->flags) &&		    !test_bit(R5_UPTODATE, &dev->flags) &&		    (dev->toread || (dev->towrite &&		     !test_bit(R5_OVERWRITE, &dev->flags)) ||		     s->syncing || s->expanding ||		     (s->failed >= 1 &&		      (sh->dev[r6s->failed_num[0]].toread ||		       s->to_write)) ||		     (s->failed >= 2 &&		      (sh->dev[r6s->failed_num[1]].toread ||		       s->to_write)))) {			/* we would like to get this block, possibly			 * by computing it, but we might not be able to			 */			if (s->uptodate == disks-1) {				pr_debug("Computing stripe %llu block %d\n",				       (unsigned long long)sh->sector, i);				compute_block_1(sh, i, 0);				s->uptodate++;			} else if ( s->uptodate == disks-2 && s->failed >= 2 ) {				/* Computing 2-failure is *very* expensive; only				 * do it if failed >= 2				 */				int other;				for (other = disks; other--; ) {					if (other == i)						continue;					if (!test_bit(R5_UPTODATE,					      &sh->dev[other].flags))						break;				}				BUG_ON(other < 0);				pr_debug("Computing stripe %llu blocks %d,%d\n",				       (unsigned long long)sh->sector,				       i, other);				compute_block_2(sh, i, other);				s->uptodate += 2;			} else if (test_bit(R5_Insync, &dev->flags)) {				set_bit(R5_LOCKED, &dev->flags);				set_bit(R5_Wantread, &dev->flags);				s->locked++;				pr_debug("Reading block %d (sync=%d)\n",					i, s->syncing);			}		}	}	set_bit(STRIPE_HANDLE, &sh->state);}/* handle_completed_write_requests * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. */static void handle_completed_write_requests(raid5_conf_t *conf,	struct stripe_head *sh, int disks, struct bio **return_bi){	int i;	struct r5dev *dev;	for (i = disks; i--; )		if (sh->dev[i].written) {			dev = &sh->dev[i];			if (!test_bit(R5_LOCKED, &dev->flags) &&				test_bit(R5_UPTODATE, &dev->flags)) {				/* We can return any write requests */				struct bio *wbi, *wbi2;				int bitmap_end = 0;				pr_debug("Return write for disc %d\n", i);				spin_lock_irq(&conf->device_lock);				wbi = dev->written;				dev->written = NULL;				while (wbi && wbi->bi_sector <					dev->sector + STRIPE_SECTORS) {					wbi2 = r5_next_bio(wbi, dev->sector);					if (--wbi->bi_phys_segments == 0) {						md_write_end(conf->mddev);						wbi->bi_next = *return_bi;						*return_bi = wbi;					}					wbi = wbi2;				}				if (dev->towrite == NULL)					bitmap_end = 1;				spin_unlock_irq(&conf->device_lock);				if (bitmap_end)					bitmap_endwrite(conf->mddev->bitmap,							sh->sector,							STRIPE_SECTORS,					 !test_bit(STRIPE_DEGRADED, &sh->state),							0);			}		}}static void handle_issuing_new_write_requests5(raid5_conf_t *conf,		struct stripe_head *sh,	struct stripe_head_state *s, int disks){	int rmw = 0, rcw = 0, i;	for (i = disks; i--; ) {		/* would I have to read this buffer for read_modify_write */		struct r5dev *dev = &sh->dev[i];		if ((dev->towrite || i == sh->pd_idx) &&		    !test_bit(R5_LOCKED, &dev->flags) &&		    !(test_bit(R5_UPTODATE, &dev->flags) ||		      test_bit(R5_Wantcompute, &dev->flags))) {			if (test_bit(R5_Insync, &dev->flags))				rmw++;			else				rmw += 2*disks;  /* cannot read it */		}		/* Would I have to read this buffer for reconstruct_write */		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&		    !test_bit(R5_LOCKED, &dev->flags) &&		    !(test_bit(R5_UPTODATE, &dev->flags) ||		    test_bit(R5_Wantcompute, &dev->flags))) {			if (test_bit(R5_Insync, &dev->flags)) rcw++;			else				rcw += 2*disks;		}	}	pr_debug("for sector %llu, rmw=%d rcw=%d\n",		(unsigned long long)sh->sector, rmw, rcw);	set_bit(STRIPE_HANDLE, &sh->state);	if (rmw < rcw && rmw > 0)		/* prefer read-modify-write, but need to get some data */		for (i = disks; i--; ) {			struct r5dev *dev = &sh->dev[i];			if ((dev->towrite || i == sh->pd_idx) &&			    !test_bit(R5_LOCKED, &dev->flags) &&			    !(test_bit(R5_UPTODATE, &dev->flags) ||			    test_bit(R5_Wantcompute, &dev->flags)) &&			    test_bit(R5_Insync, &dev->flags)) {				if (				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {					pr_debug("Read_old block "						"%d for r-m-w\n", i);					set_bit(R5_LOCKED, &dev->flags);					set_bit(R5_Wantread, &dev->flags);					if (!test_and_set_bit(						STRIPE_OP_IO, &sh->ops.pending))						sh->ops.count++;					s->locked++;				} else {					set_bit(STRIPE_DELAYED, &sh->state);					set_bit(STRIPE_HANDLE, &sh->state);				}			}		}	if (rcw <= rmw && rcw > 0)		/* want reconstruct write, but need to get some data */		for (i = disks; i--; ) {			struct r5dev *dev = &sh->dev[i];			if (!test_bit(R5_OVERWRITE, &dev->flags) &&			    i != sh->pd_idx &&			    !test_bit(R5_LOCKED, &dev->flags) &&			    !(test_bit(R5_UPTODATE, &dev->flags) ||			    test_bit(R5_Wantcompute, &dev->flags)) &&			    test_bit(R5_Insync, &dev->flags)) {				if (				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {					pr_debug("Read_old block "						"%d for Reconstruct\n", i);					set_bit(R5_LOCKED, &dev->flags);					set_bit(R5_Wantread, &dev->flags);					if (!test_and_set_bit(						STRIPE_OP_IO, &sh->ops.pending))						sh->ops.count++;					s->locked++;				} else {					set_bit(STRIPE_DELAYED, &sh->state);					set_bit(STRIPE_HANDLE, &sh->state);				}			}		}	/* now if nothing is locked, and if we have enough data,	 * we can start a write request	 */	/* since handle_stripe can be called at any time we need to handle the	 * case where a compute block operation has been submitted and then a	 * subsequent call wants to start a write request.  raid5_run_ops only	 * handles the case where compute block and postxor are requested	 * simultaneously.  If this is not the case then new writes need to be	 * held off until the compute completes.	 */	if ((s->req_compute ||	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&		(s->locked == 0 && (rcw == 0 || rmw == 0) &&		!test_bit(STRIPE_BIT_DELAY, &sh->state)))		s->locked += handle_write_operations5(sh, rcw == 0, 0);}static void handle_issuing_new_write_requests6(raid5_conf_t *conf,		struct stripe_head *sh,	struct stripe_head_state *s,		struct r6_state *r6s, int disks){	int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;	int qd_idx = r6s->qd_idx;	for (i = disks; i--; ) {		struct r5dev *dev = &sh->dev[i];		/* Would I have to read this buffer for reconstruct_write */		if (!test_bit(R5_OVERWRITE, &dev->flags)		    && i != pd_idx && i != qd_idx		    && (!test_bit(R5_LOCKED, &dev->flags)			    ) &&		    !test_bit(R5_UPTODATE, &dev->flags)) {			if (test_bit(R5_Insync, &dev->flags)) rcw++;			else {				pr_debug("raid6: must_compute: "					"disk %d flags=%#lx\n", i, dev->flags);				must_compute++;			}		}	}	pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",	       (unsigned long long)sh->sector, rcw, must_compute);	set_bit(STRIPE_HANDLE, &sh->state);	if (rcw > 0)		/* want reconstruct write, but need to get some data */		for (i = disks; i--; ) {			struct r5dev *dev = &sh->dev[i];			if (!test_bit(R5_OVERWRITE, &dev->flags)			    && !(s->failed == 0 && (i == pd_idx || i == qd_idx))			    && !test_bit(R5_LOCKED, &dev->flags) &&			    !test_bit(R5_UPTODATE, &dev->flags) &&			    test_bit(R5_Insync, &dev->flags)) {				if (				  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {					pr_debug("Read_old stripe %llu "						"block %d for Reconstruct\n",					     (unsigned long long)sh->sector, i);					set_bit(R5_LOCKED, &dev->flags);					set_bit(R5_Wantread, &dev->flags);					s->locked++;				} else {					pr_debug("Request delayed stripe %llu "						"block %d for Reconstruct\n",					     (unsigned long long)sh->sector, i);					set_bit(STRIPE_DELAYED, &sh->state);					set_bit(STRIPE_HANDLE, &sh->state);				}			}		}	/* now if nothing is locked, and if we have enough data, we can start a	 * write request	 */	if (s->locked == 0 && rcw == 0 &&	    !test_bit(STRIPE_BIT_DELAY, &sh->state)) {		if (must_compute > 0) {			/* We have failed blocks and need to compute them */			switch (s->failed) {			case 0:				BUG();			case 1:				compute_block_1(sh, r6s->failed_num[0], 0);				break;			case 2:				compute_block_2(sh, r6s->failed_num[0],						r6s->failed_num[1]);				break;			default: /* This request should have been failed? */				BUG();			}		}		pr_debug("Com

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?