raid10.c
来自「linux 内核源代码」· C语言 代码 · 共 2,188 行 · 第 1/4 页
C
2,188 行
!test_bit(In_sync, &rdev->flags)) continue; /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. */ if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) { disk = ndisk; slot = nslot; break; } new_distance = abs(r10_bio->devs[nslot].addr - conf->mirrors[ndisk].head_position); if (new_distance < current_distance) { current_distance = new_distance; disk = ndisk; slot = nslot; } }rb_out: r10_bio->read_slot = slot;/* conf->next_seq_sect = this_sector + sectors;*/ if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) atomic_inc(&conf->mirrors[disk].rdev->nr_pending); else disk = -1; rcu_read_unlock(); return disk;}static void unplug_slaves(mddev_t *mddev){ conf_t *conf = mddev_to_conf(mddev); int i; rcu_read_lock(); for (i=0; i<mddev->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { struct request_queue *r_queue = bdev_get_queue(rdev->bdev); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); blk_unplug(r_queue); rdev_dec_pending(rdev, mddev); rcu_read_lock(); } } rcu_read_unlock();}static void raid10_unplug(struct request_queue *q){ mddev_t *mddev = q->queuedata; unplug_slaves(q->queuedata); md_wakeup_thread(mddev->thread);}static int raid10_congested(void *data, int bits){ mddev_t *mddev = data; conf_t *conf = mddev_to_conf(mddev); int i, ret = 0; rcu_read_lock(); for (i = 0; i < mddev->raid_disks && ret == 0; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } } rcu_read_unlock(); return ret;}/* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. * To do this we raise a 'barrier'. * The 'barrier' is a counter that can be raised multiple times * to count how many activities are happening which preclude * normal IO. * We can only raise the barrier if there is no pending IO. * i.e. if nr_pending == 0. * We choose only to raise the barrier if no-one is waiting for the * barrier to go down. This means that as soon as an IO request * is ready, no other operations which require a barrier will start * until the IO request has had a chance. * * So: regular IO calls 'wait_barrier'. When that returns there * is no backgroup IO happening, It must arrange to call * allow_barrier when it has finished its IO. * backgroup IO calls must call raise_barrier. Once that returns * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */#define RESYNC_DEPTH 32static void raise_barrier(conf_t *conf, int force){ BUG_ON(force && !conf->barrier); spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting (unless 'force') */ wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, conf->resync_lock, raid10_unplug(conf->mddev->queue)); /* block any new IO from starting */ conf->barrier++; /* No wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, conf->resync_lock, raid10_unplug(conf->mddev->queue)); spin_unlock_irq(&conf->resync_lock);}static void lower_barrier(conf_t *conf){ unsigned long flags; spin_lock_irqsave(&conf->resync_lock, flags); conf->barrier--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier);}static void wait_barrier(conf_t *conf){ spin_lock_irq(&conf->resync_lock); if (conf->barrier) { conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, !conf->barrier, conf->resync_lock, raid10_unplug(conf->mddev->queue)); conf->nr_waiting--; } conf->nr_pending++; spin_unlock_irq(&conf->resync_lock);}static void allow_barrier(conf_t *conf){ unsigned long flags; spin_lock_irqsave(&conf->resync_lock, flags); conf->nr_pending--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier);}static void freeze_array(conf_t *conf){ /* stop syncio and normal IO and wait for everything to * go quiet. * We increment barrier and nr_waiting, and then * wait until barrier+nr_pending match nr_queued+2 */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, conf->barrier+conf->nr_pending == conf->nr_queued+2, conf->resync_lock, raid10_unplug(conf->mddev->queue)); spin_unlock_irq(&conf->resync_lock);}static void unfreeze_array(conf_t *conf){ /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); conf->barrier--; conf->nr_waiting--; wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock);}static int make_request(struct request_queue *q, struct bio * bio){ mddev_t *mddev = q->queuedata; conf_t *conf = mddev_to_conf(mddev); mirror_info_t *mirror; r10bio_t *r10_bio; struct bio *read_bio; int i; int chunk_sects = conf->chunk_mask + 1; const int rw = bio_data_dir(bio); const int do_sync = bio_sync(bio); struct bio_list bl; unsigned long flags; if (unlikely(bio_barrier(bio))) { bio_endio(bio, -EOPNOTSUPP); return 0; } /* If this request crosses a chunk boundary, we need to * split it. This will only happen for 1 PAGE (or less) requests. */ if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) > chunk_sects && conf->near_copies < conf->raid_disks)) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ if (bio->bi_vcnt != 1 || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. */ bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); if (make_request(q, &bp->bio1)) generic_make_request(&bp->bio1); if (make_request(q, &bp->bio2)) generic_make_request(&bp->bio2); bio_pair_release(bp); return 0; bad_map: printk("raid10_make_request bug: can't convert block across chunks" " or bigger than %dk %llu %d\n", chunk_sects/2, (unsigned long long)bio->bi_sector, bio->bi_size >> 10); bio_io_error(bio); return 0; } md_write_start(mddev, bio); /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. */ wait_barrier(conf); disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; r10_bio->sectors = bio->bi_size >> 9; r10_bio->mddev = mddev; r10_bio->sector = bio->bi_sector; r10_bio->state = 0; if (rw == READ) { /* * read balancing logic: */ int disk = read_balance(conf, r10_bio); int slot = r10_bio->read_slot; if (disk < 0) { raid_end_bio_io(r10_bio); return 0; } mirror = conf->mirrors + disk; read_bio = bio_clone(bio, GFP_NOIO); r10_bio->devs[slot].bio = read_bio; read_bio->bi_sector = r10_bio->devs[slot].addr + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; generic_make_request(read_bio); return 0; } /* * WRITE: */ /* first select target devices under spinlock and * inc refcount on their rdev. Record them by setting * bios[x] to bio */ raid10_find_phys(conf, r10_bio); rcu_read_lock(); for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); r10_bio->devs[i].bio = bio; } else { r10_bio->devs[i].bio = NULL; set_bit(R10BIO_Degraded, &r10_bio->state); } } rcu_read_unlock(); atomic_set(&r10_bio->remaining, 0); bio_list_init(&bl); for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; if (!r10_bio->devs[i].bio) continue; mbio = bio_clone(bio, GFP_NOIO); r10_bio->devs[i].bio = mbio; mbio->bi_sector = r10_bio->devs[i].addr+ conf->mirrors[d].rdev->data_offset; mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; mbio->bi_rw = WRITE | do_sync; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); bio_list_add(&bl, mbio); } if (unlikely(!atomic_read(&r10_bio->remaining))) { /* the array is dead */ md_write_end(mddev); raid_end_bio_io(r10_bio); return 0; } bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); if (do_sync) md_wakeup_thread(mddev->thread); return 0;}static void status(struct seq_file *seq, mddev_t *mddev){ conf_t *conf = mddev_to_conf(mddev); int i; if (conf->near_copies < conf->raid_disks) seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); if (conf->near_copies > 1) seq_printf(seq, " %d near-copies", conf->near_copies); if (conf->far_copies > 1) { if (conf->far_offset) seq_printf(seq, " %d offset-copies", conf->far_copies); else seq_printf(seq, " %d far-copies", conf->far_copies); } seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf(seq, "%s", conf->mirrors[i].rdev && test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); seq_printf(seq, "]");}static void error(mddev_t *mddev, mdk_rdev_t *rdev){ char b[BDEVNAME_SIZE]; conf_t *conf = mddev_to_conf(mddev); /* * If it is not operational, then we have already marked it as dead * else if it is the last working disks, ignore the error, let the * next level up know. * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) && conf->raid_disks-mddev->degraded == 1) /* * Don't fail the drive, just return an IO error. * The test should really be more sophisticated than * "working_disks == 1", but it isn't critical, and * can wait until we do more sophisticated "is the drive * really dead" tests... */ return; if (test_and_clear_bit(In_sync, &rdev->flags)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);}static void print_conf(conf_t *conf){ int i; mirror_info_t *tmp; printk("RAID10 conf printout:\n"); if (!conf) { printk("(!conf)\n"); return; } printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->mirrors + i; if (tmp->rdev) printk(" disk %d, wo:%d, o:%d, dev:%s\n", i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev,b)); }}static void close_sync(conf_t *conf){ wait_barrier(conf); allow_barrier(conf); mempool_destroy(conf->r10buf_pool); conf->r10buf_pool = NULL;}/* check if there are enough drives for * every block to appear on atleast one */static int enough(conf_t *conf){ int first = 0; do { int n = conf->copies; int cnt = 0; while (n--) { if (conf->mirrors[first].rdev) cnt++; first = (first+1) % conf->raid_disks; } if (cnt == 0) return 0; } while (first != 0); return 1;}static int raid10_spare_active(mddev_t *mddev){ int i; conf_t *conf = mddev->private; mirror_info_t *tmp; /* * Find all non-in_sync disks within the RAID10 configuration * and mark them in_sync */ for (i = 0; i < conf->raid_disks; i++) { tmp = conf->mirrors + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; spin_unlock_irqrestore(&conf->device_lock, flags); } } print_conf(conf); return 0;}static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev){ conf_t *conf = mddev->private; int found = 0; int mirror; mirror_info_t *p; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ return 0; if (!enough(conf)) return 0; if (rdev->saved_raid_disk >= 0 && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else mirror = 0; for ( ; mirror < mddev->raid_disks; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, rdev->bdev->bd_disk->queue); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) mddev->queue->max_sectors = (PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; found = 1; if (rdev->saved_raid_disk != mirror) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } print_conf(conf); return found;}static int raid10_remove_disk(mddev_t *mddev, int number){ conf_t *conf = mddev->private; int err = 0; mdk_rdev_t *rdev;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?