raid1.c
来自「linux 内核源代码」· C语言 代码 · 共 2,200 行 · 第 1/4 页
C
2,200 行
blk_unplug(r_queue); rdev_dec_pending(rdev, mddev); rcu_read_lock(); } } rcu_read_unlock();}static void raid1_unplug(struct request_queue *q){ mddev_t *mddev = q->queuedata; unplug_slaves(mddev); md_wakeup_thread(mddev->thread);}static int raid1_congested(void *data, int bits){ mddev_t *mddev = data; conf_t *conf = mddev_to_conf(mddev); int i, ret = 0; rcu_read_lock(); for (i = 0; i < mddev->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); /* Note the '|| 1' - when read_balance prefers * non-congested targets, it can be removed */ if ((bits & (1<<BDI_write_congested)) || 1) ret |= bdi_congested(&q->backing_dev_info, bits); else ret &= bdi_congested(&q->backing_dev_info, bits); } } rcu_read_unlock(); return ret;}/* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. * To do this we raise a 'barrier'. * The 'barrier' is a counter that can be raised multiple times * to count how many activities are happening which preclude * normal IO. * We can only raise the barrier if there is no pending IO. * i.e. if nr_pending == 0. * We choose only to raise the barrier if no-one is waiting for the * barrier to go down. This means that as soon as an IO request * is ready, no other operations which require a barrier will start * until the IO request has had a chance. * * So: regular IO calls 'wait_barrier'. When that returns there * is no backgroup IO happening, It must arrange to call * allow_barrier when it has finished its IO. * backgroup IO calls must call raise_barrier. Once that returns * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */#define RESYNC_DEPTH 32static void raise_barrier(conf_t *conf){ spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, conf->resync_lock, raid1_unplug(conf->mddev->queue)); /* block any new IO from starting */ conf->barrier++; /* No wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, conf->resync_lock, raid1_unplug(conf->mddev->queue)); spin_unlock_irq(&conf->resync_lock);}static void lower_barrier(conf_t *conf){ unsigned long flags; spin_lock_irqsave(&conf->resync_lock, flags); conf->barrier--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier);}static void wait_barrier(conf_t *conf){ spin_lock_irq(&conf->resync_lock); if (conf->barrier) { conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, !conf->barrier, conf->resync_lock, raid1_unplug(conf->mddev->queue)); conf->nr_waiting--; } conf->nr_pending++; spin_unlock_irq(&conf->resync_lock);}static void allow_barrier(conf_t *conf){ unsigned long flags; spin_lock_irqsave(&conf->resync_lock, flags); conf->nr_pending--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier);}static void freeze_array(conf_t *conf){ /* stop syncio and normal IO and wait for everything to * go quite. * We increment barrier and nr_waiting, and then * wait until barrier+nr_pending match nr_queued+2 */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; wait_event_lock_irq(conf->wait_barrier, conf->barrier+conf->nr_pending == conf->nr_queued+2, conf->resync_lock, raid1_unplug(conf->mddev->queue)); spin_unlock_irq(&conf->resync_lock);}static void unfreeze_array(conf_t *conf){ /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); conf->barrier--; conf->nr_waiting--; wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock);}/* duplicate the data pages for behind I/O */static struct page **alloc_behind_pages(struct bio *bio){ int i; struct bio_vec *bvec; struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), GFP_NOIO); if (unlikely(!pages)) goto do_sync_io; bio_for_each_segment(bvec, bio, i) { pages[i] = alloc_page(GFP_NOIO); if (unlikely(!pages[i])) goto do_sync_io; memcpy(kmap(pages[i]) + bvec->bv_offset, kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); kunmap(pages[i]); kunmap(bvec->bv_page); } return pages;do_sync_io: if (pages) for (i = 0; i < bio->bi_vcnt && pages[i]; i++) put_page(pages[i]); kfree(pages); PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); return NULL;}static int make_request(struct request_queue *q, struct bio * bio){ mddev_t *mddev = q->queuedata; conf_t *conf = mddev_to_conf(mddev); mirror_info_t *mirror; r1bio_t *r1_bio; struct bio *read_bio; int i, targets = 0, disks; mdk_rdev_t *rdev; struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct bio_list bl; struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); const int do_sync = bio_sync(bio); int do_barriers; /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. * We test barriers_work *after* md_write_start as md_write_start * may cause the first superblock write, and that will check out * if barriers work. */ md_write_start(mddev, bio); /* wait on superblock update early */ if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { if (rw == WRITE) md_write_end(mddev); bio_endio(bio, -EOPNOTSUPP); return 0; } wait_barrier(conf); disk_stat_inc(mddev->gendisk, ios[rw]); disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); /* * make_request() can abort the operation when READA is being * used and no empty request is available. * */ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = bio; r1_bio->sectors = bio->bi_size >> 9; r1_bio->state = 0; r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; if (rw == READ) { /* * read balancing logic: */ int rdisk = read_balance(conf, r1_bio); if (rdisk < 0) { /* couldn't find anywhere to read from */ raid_end_bio_io(r1_bio); return 0; } mirror = conf->mirrors + rdisk; r1_bio->read_disk = rdisk; read_bio = bio_clone(bio, GFP_NOIO); r1_bio->bios[rdisk] = read_bio; read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r1_bio; generic_make_request(read_bio); return 0; } /* * WRITE: */ /* first select target devices under spinlock and * inc refcount on their rdev. Record them by setting * bios[x] to bio */ disks = conf->raid_disks;#if 0 { static int first=1; if (first) printk("First Write sector %llu disks %d\n", (unsigned long long)r1_bio->sector, disks); first = 0; }#endif rcu_read_lock(); for (i = 0; i < disks; i++) { if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); if (test_bit(Faulty, &rdev->flags)) { rdev_dec_pending(rdev, mddev); r1_bio->bios[i] = NULL; } else r1_bio->bios[i] = bio; targets++; } else r1_bio->bios[i] = NULL; } rcu_read_unlock(); BUG_ON(targets == 0); /* we never fail the last device */ if (targets < conf->raid_disks) { /* array is degraded, we will not clear the bitmap * on I/O completion (see raid1_end_write_request) */ set_bit(R1BIO_Degraded, &r1_bio->state); } /* do behind I/O ? */ if (bitmap && atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && (behind_pages = alloc_behind_pages(bio)) != NULL) set_bit(R1BIO_BehindIO, &r1_bio->state); atomic_set(&r1_bio->remaining, 0); atomic_set(&r1_bio->behind_remaining, 0); do_barriers = bio_barrier(bio); if (do_barriers) set_bit(R1BIO_Barrier, &r1_bio->state); bio_list_init(&bl); for (i = 0; i < disks; i++) { struct bio *mbio; if (!r1_bio->bios[i]) continue; mbio = bio_clone(bio, GFP_NOIO); r1_bio->bios[i] = mbio; mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; mbio->bi_rw = WRITE | do_barriers | do_sync; mbio->bi_private = r1_bio; if (behind_pages) { struct bio_vec *bvec; int j; /* Yes, I really want the '__' version so that * we clear any unused pointer in the io_vec, rather * than leave them unchanged. This is important * because when we come to free the pages, we won't * know the originial bi_idx, so we just free * them all */ __bio_for_each_segment(bvec, mbio, j, 0) bvec->bv_page = behind_pages[j]; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } atomic_inc(&r1_bio->remaining); bio_list_add(&bl, mbio); } kfree(behind_pages); /* the behind pages are attached to the bios now */ bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, test_bit(R1BIO_BehindIO, &r1_bio->state)); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); bio_list_init(&bl); blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); if (do_sync) md_wakeup_thread(mddev->thread);#if 0 while ((bio = bio_list_pop(&bl)) != NULL) generic_make_request(bio);#endif return 0;}static void status(struct seq_file *seq, mddev_t *mddev){ conf_t *conf = mddev_to_conf(mddev); int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); } rcu_read_unlock(); seq_printf(seq, "]");}static void error(mddev_t *mddev, mdk_rdev_t *rdev){ char b[BDEVNAME_SIZE]; conf_t *conf = mddev_to_conf(mddev); /* * If it is not operational, then we have already marked it as dead * else if it is the last working disks, ignore the error, let the * next level up know. * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) && (conf->raid_disks - mddev->degraded) == 1) /* * Don't fail the drive, act as though we were just a * normal single drive */ return; if (test_and_clear_bit(In_sync, &rdev->flags)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; set_bit(Faulty, &rdev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);}static void print_conf(conf_t *conf){ int i; printk("RAID1 conf printout:\n"); if (!conf) { printk("(!conf)\n"); return; } printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev) printk(" disk %d, wo:%d, o:%d, dev:%s\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), bdevname(rdev->bdev,b)); } rcu_read_unlock();}static void close_sync(conf_t *conf){ wait_barrier(conf); allow_barrier(conf); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL;}static int raid1_spare_active(mddev_t *mddev){ int i; conf_t *conf = mddev->private; /* * Find all failed disks within the RAID1 configuration * and mark them readable. * Called under mddev lock, so rcu protection not needed. */ for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = conf->mirrors[i].rdev; if (rdev && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; spin_unlock_irqrestore(&conf->device_lock, flags); } } print_conf(conf); return 0;}static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev){ conf_t *conf = mddev->private; int found = 0; int mirror = 0; mirror_info_t *p; for (mirror=0; mirror < mddev->raid_disks; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { blk_queue_stack_limits(mddev->queue, rdev->bdev->bd_disk->queue); /* as we don't honour merge_bvec_fn, we must never risk * violating it, so limit ->max_sector to one PAGE, as * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && mddev->queue->max_sectors > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; found = 1; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ if (rdev->saved_raid_disk < 0) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } print_conf(conf); return found;}static int raid1_remove_disk(mddev_t *mddev, int number){ conf_t *conf = mddev->private; int err = 0; mdk_rdev_t *rdev; mirror_info_t *p = conf->mirrors+ number; print_conf(conf); rdev = p->rdev; if (rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; goto abort; } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; } }abort: print_conf(conf); return err;}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?