/*
 * raid10.c -- from "Linux Kernel 2.6.9 for OMAP1710"
 * C source, 1,781 lines total; this excerpt is part 1 of 4.
 */
/**
 * raid10_mergeable_bvec -- tell bio layer if two requests can be merged
 * @q: request queue
 * @bio: the buffer head that's been built up so far
 * @bio_vec: the request that could be merged to it.
 *
 * Return amount of bytes we can accept at this offset
 * If near_copies == raid_disks, there are no striping issues,
 * but in that case, the function isn't called at all.
 */
static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio,
				 struct bio_vec *bio_vec)
{
	mddev_t *mddev = q->queuedata;
	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
	int max;
	unsigned int chunk_sectors = mddev->chunk_size >> 9;
	unsigned int bio_sectors = bio->bi_size >> 9;

	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	if (max <= bio_vec->bv_len && bio_sectors == 0)
		return bio_vec->bv_len;
	else
		return max;
}
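/*
 * Worked example of the arithmetic above (illustrative numbers, not
 * from the original source): with 64KiB chunks, chunk_sectors = 128.
 * A bio that already ends 120 sectors into its chunk
 * ((sector & 127) + bio_sectors == 120) yields
 * max = (128 - 120) << 9 = 4096 bytes, i.e. at most one more 4KiB
 * page may be merged before the request would cross a chunk boundary.
 */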
/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts, both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */

/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies / far_copies geometry.
 */
static int read_balance(conf_t *conf, r10bio_t *r10_bio)
{
	const unsigned long this_sector = r10_bio->sector;
	int disk, slot, nslot;
	const int sectors = r10_bio->sectors;
	sector_t new_distance, current_distance;

	raid10_find_phys(conf, r10_bio);
	spin_lock_irq(&conf->device_lock);
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on, or below the resync window.
	 * We take the first readable disk when above the resync window.
	 */
	if (conf->mddev->recovery_cp < MaxSector &&
	    (this_sector + sectors >= conf->next_resync)) {
		/* make sure that disk is operational */
		slot = 0;
		disk = r10_bio->devs[slot].devnum;

		while (!conf->mirrors[disk].rdev ||
		       !conf->mirrors[disk].rdev->in_sync) {
			slot++;
			if (slot == conf->copies) {
				slot = 0;
				disk = -1;
				break;
			}
			disk = r10_bio->devs[slot].devnum;
		}
		goto rb_out;
	}

	/* make sure the disk is operational */
	slot = 0;
	disk = r10_bio->devs[slot].devnum;
	while (!conf->mirrors[disk].rdev ||
	       !conf->mirrors[disk].rdev->in_sync) {
		slot++;
		if (slot == conf->copies) {
			disk = -1;
			goto rb_out;
		}
		disk = r10_bio->devs[slot].devnum;
	}

	current_distance = abs(this_sector - conf->mirrors[disk].head_position);

	/* Find the disk whose head is closest */
	for (nslot = slot; nslot < conf->copies; nslot++) {
		int ndisk = r10_bio->devs[nslot].devnum;

		if (!conf->mirrors[ndisk].rdev ||
		    !conf->mirrors[ndisk].rdev->in_sync)
			continue;

		if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) {
			disk = ndisk;
			slot = nslot;
			break;
		}
		new_distance = abs(r10_bio->devs[nslot].addr -
				   conf->mirrors[ndisk].head_position);
		if (new_distance < current_distance) {
			current_distance = new_distance;
			disk = ndisk;
			slot = nslot;
		}
	}

rb_out:
	r10_bio->read_slot = slot;
/*	conf->next_seq_sect = this_sector + sectors;*/

	if (disk >= 0 && conf->mirrors[disk].rdev)
		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
	spin_unlock_irq(&conf->device_lock);

	return disk;
}
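/*
 * Note on the selection loop above (explanatory, not in the original
 * source): a completely idle mirror (nr_pending == 0) is taken
 * immediately, short-circuiting the distance comparison; otherwise the
 * copy whose recorded head_position is nearest to the target address
 * wins.  head_position is only a heuristic -- it is updated from the
 * IRQ-context completion handlers and may lag the true head location.
 */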
static void unplug_slaves(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i = 0; i < mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			spin_unlock_irqrestore(&conf->device_lock, flags);

			if (r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);

			spin_lock_irqsave(&conf->device_lock, flags);
			atomic_dec(&rdev->nr_pending);
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

static void raid10_unplug(request_queue_t *q)
{
	unplug_slaves(q->queuedata);
}

static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
			      sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;
	int i, ret = 0;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i = 0; i < mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue = bdev_get_queue(bdev);

			if (r_queue->issue_flush_fn) {
				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
							      error_sector);
				if (ret)
					break;
			}
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
	return ret;
}

/*
 * Throttle resync depth, so that we can both get proper overlapping of
 * requests, but are still able to handle normal requests quickly.
 */
#define RESYNC_DEPTH 32

static void device_barrier(conf_t *conf, sector_t sect)
{
	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
			    conf->resync_lock, unplug_slaves(conf->mddev));

	if (!conf->barrier++) {
		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
				    conf->resync_lock, unplug_slaves(conf->mddev));
		if (conf->nr_pending)
			BUG();
	}
	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock, unplug_slaves(conf->mddev));
	conf->next_resync = sect;
	spin_unlock_irq(&conf->resync_lock);
}

static int make_request(request_queue_t *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = mddev_to_conf(mddev);
	mirror_info_t *mirror;
	r10bio_t *r10_bio;
	struct bio *read_bio;
	int i;
	int chunk_sects = conf->chunk_mask + 1;

	/* If this request crosses a chunk boundary, we need to
	 * split it. This will only happen for 1 PAGE (or less) requests.
	 */
	if (unlikely((bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
		     > chunk_sects &&
		     conf->near_copies < conf->raid_disks)) {
		struct bio_pair *bp;
		/* Sanity check -- queue functions should prevent this happening */
		if (bio->bi_vcnt != 1 ||
		    bio->bi_idx != 0)
			goto bad_map;
		/* This is a one page bio that upper layers
		 * refuse to split for us, so we need to split it.
		 */
		bp = bio_split(bio, bio_split_pool,
			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)));
		if (make_request(q, &bp->bio1))
			generic_make_request(&bp->bio1);
		if (make_request(q, &bp->bio2))
			generic_make_request(&bp->bio2);

		bio_pair_release(bp);
		return 0;
	bad_map:
		printk("raid10_make_request bug: can't convert block across chunks"
		       " or bigger than %dk %llu %d\n", chunk_sects/2,
		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);

		bio_io_error(bio, bio->bi_size);
		return 0;
	}

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);

	if (bio_data_dir(bio) == WRITE) {
		disk_stat_inc(mddev->gendisk, writes);
		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
	} else {
		disk_stat_inc(mddev->gendisk, reads);
		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
	}

	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

	r10_bio->master_bio = bio;
	r10_bio->sectors = bio->bi_size >> 9;

	r10_bio->mddev = mddev;
	r10_bio->sector = bio->bi_sector;

	if (bio_data_dir(bio) == READ) {
		/*
		 * read balancing logic:
		 */
		int disk = read_balance(conf, r10_bio);
		int slot = r10_bio->read_slot;
		if (disk < 0) {
			raid_end_bio_io(r10_bio);
			return 0;
		}
		mirror = conf->mirrors + disk;

		read_bio = bio_clone(bio, GFP_NOIO);

		r10_bio->devs[slot].bio = read_bio;

		read_bio->bi_sector = r10_bio->devs[slot].addr +
			mirror->rdev->data_offset;
		read_bio->bi_bdev = mirror->rdev->bdev;
		read_bio->bi_end_io = raid10_end_read_request;
		read_bio->bi_rw = READ;
		read_bio->bi_private = r10_bio;

		generic_make_request(read_bio);
		return 0;
	}

	/*
	 * WRITE:
	 */
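	/*
	 * Explanatory note (not in the original source): the write path
	 * below fans the master bio out to every in-sync copy.
	 * r10_bio->remaining is biased to 1 before the clones are
	 * submitted, so the final atomic_dec_and_test() cannot fire
	 * until all of them have been issued; the extra reference is
	 * dropped at the bottom, which also covers the case where no
	 * device was writable.
	 */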
	/* first select target devices under spinlock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 */
	raid10_find_phys(conf, r10_bio);
	spin_lock_irq(&conf->device_lock);
	for (i = 0; i < conf->copies; i++) {
		int d = r10_bio->devs[i].devnum;
		if (conf->mirrors[d].rdev &&
		    !conf->mirrors[d].rdev->faulty) {
			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
			r10_bio->devs[i].bio = bio;
		} else
			r10_bio->devs[i].bio = NULL;
	}
	spin_unlock_irq(&conf->device_lock);

	atomic_set(&r10_bio->remaining, 1);
	md_write_start(mddev);
	for (i = 0; i < conf->copies; i++) {
		struct bio *mbio;
		int d = r10_bio->devs[i].devnum;
		if (!r10_bio->devs[i].bio)
			continue;

		mbio = bio_clone(bio, GFP_NOIO);
		r10_bio->devs[i].bio = mbio;

		mbio->bi_sector = r10_bio->devs[i].addr +
			conf->mirrors[d].rdev->data_offset;
		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		mbio->bi_end_io = raid10_end_write_request;
		mbio->bi_rw = WRITE;
		mbio->bi_private = r10_bio;

		atomic_inc(&r10_bio->remaining);
		generic_make_request(mbio);
	}

	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_write_end(mddev);
		raid_end_bio_io(r10_bio);
	}

	return 0;
}

static void status(struct seq_file *seq, mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	int i;

	if (conf->near_copies < conf->raid_disks)
		seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
	if (conf->near_copies > 1)
		seq_printf(seq, " %d near-copies", conf->near_copies);
	if (conf->far_copies > 1)
		seq_printf(seq, " %d far-copies", conf->far_copies);

	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
		   conf->working_disks);
	for (i = 0; i < conf->raid_disks; i++)
		seq_printf(seq, "%s",
			   conf->mirrors[i].rdev &&
			   conf->mirrors[i].rdev->in_sync ? "U" : "_");
	seq_printf(seq, "]");
}

static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	char b[BDEVNAME_SIZE];
	conf_t *conf = mddev_to_conf(mddev);

	/*
	 * If it is not operational, then we have already marked it as dead
	 * else if it is the last working disk, ignore the error, let the
	 * next level up know.
	 * else mark the drive as failed
	 */
	if (rdev->in_sync && conf->working_disks == 1)
		/*
		 * Don't fail the drive, just return an IO error.
		 * The test should really be more sophisticated than
		 * "working_disks == 1", but it isn't critical, and
		 * can wait until we do more sophisticated "is the drive
		 * really dead" tests...
		 */
		return;
	if (rdev->in_sync) {
		mddev->degraded++;
		conf->working_disks--;
		/*
		 * if recovery is running, make sure it aborts.
		 */
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
	}
	rdev->in_sync = 0;
	rdev->faulty = 1;
	mddev->sb_dirty = 1;
	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n"
	       "	Operation continuing on %d devices\n",
	       bdevname(rdev->bdev, b), conf->working_disks);
}

static void print_conf(conf_t *conf)
{
	int i;
	mirror_info_t *tmp;

	printk("RAID10 conf printout:\n");
	if (!conf) {
		printk("(!conf)\n");
		return;
	}
	printk(" --- wd:%d rd:%d\n", conf->working_disks,
	       conf->raid_disks);

	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->mirrors + i;
		if (tmp->rdev)
			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
			       i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
			       bdevname(tmp->rdev->bdev, b));
	}
}

static void close_sync(conf_t *conf)
{
	spin_lock_irq(&conf->resync_lock);
	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
			    conf->resync_lock, unplug_slaves(conf->mddev));
	spin_unlock_irq(&conf->resync_lock);

	if (conf->barrier)
		BUG();
	if (waitqueue_active(&conf->wait_idle))
		BUG();

	mempool_destroy(conf->r10buf_pool);
	conf->r10buf_pool = NULL;
}
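/*
 * Example of the status() output above (illustrative configuration,
 * not from the original source): a healthy 4-disk array with 64KiB
 * chunks, near_copies == 2 and far_copies == 1 appears in
 * /proc/mdstat as:
 *
 *	64K chunks 2 near-copies [4/4] [UUUU]
 */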