/*
 * raid5.c
 * Excerpt from the Linux kernel 2.6.9 (OMAP1710 tree).  The full file is
 * 1,920 lines of C; this excerpt (page 1 of 4 in the original listing)
 * begins partway through handle_stripe().
 */
			bi = sh->dev[i].written;
			sh->dev[i].written = NULL;
			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
				struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
				clear_bit(BIO_UPTODATE, &bi->bi_flags);
				if (--bi->bi_phys_segments == 0) {
					md_write_end(conf->mddev);
					bi->bi_next = return_bi;
					return_bi = bi;
				}
				bi = bi2;
			}

			/* fail any reads if this device is non-operational */
			if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
				bi = sh->dev[i].toread;
				sh->dev[i].toread = NULL;
				if (bi) to_read--;
				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
					clear_bit(BIO_UPTODATE, &bi->bi_flags);
					if (--bi->bi_phys_segments == 0) {
						bi->bi_next = return_bi;
						return_bi = bi;
					}
					bi = nextbi;
				}
			}
		}
		spin_unlock_irq(&conf->device_lock);
	}
	if (failed > 1 && syncing) {
		md_done_sync(conf->mddev, STRIPE_SECTORS,0);
		clear_bit(STRIPE_SYNCING, &sh->state);
		syncing = 0;
	}

	/* might be able to return some write requests if the parity block
	 * is safe, or on a failed drive
	 */
	dev = &sh->dev[sh->pd_idx];
	if ( written &&
	     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
		test_bit(R5_UPTODATE, &dev->flags))
	       || (failed == 1 && failed_num == sh->pd_idx))
	    ) {
	    /* any written block on an uptodate or failed drive can be returned.
	     * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
	     * never LOCKED, so we don't need to test 'failed' directly.
	     */
	    for (i=disks; i--; )
		if (sh->dev[i].written) {
		    dev = &sh->dev[i];
		    if (!test_bit(R5_LOCKED, &dev->flags) &&
			 test_bit(R5_UPTODATE, &dev->flags) ) {
			/* We can return any write requests */
			    struct bio *wbi, *wbi2;
			    PRINTK("Return write for disc %d\n", i);
			    spin_lock_irq(&conf->device_lock);
			    wbi = dev->written;
			    dev->written = NULL;
			    while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
				    wbi2 = r5_next_bio(wbi, dev->sector);
				    if (--wbi->bi_phys_segments == 0) {
					    md_write_end(conf->mddev);
					    wbi->bi_next = return_bi;
					    return_bi = wbi;
				    }
				    wbi = wbi2;
			    }
			    spin_unlock_irq(&conf->device_lock);
		    }
		}
	}

	/* Now we might consider reading some blocks, either to check/generate
	 * parity, or to satisfy requests
	 * or to load a block that is being partially written.
	 */
	if (to_read || non_overwrite || (syncing && (uptodate < disks))) {
		for (i=disks; i--;) {
			dev = &sh->dev[i];
			if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
			    (dev->toread ||
			     (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
			     syncing ||
			     (failed && (sh->dev[failed_num].toread ||
					 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
				    )
				) {
				/* we would like to get this block, possibly
				 * by computing it, but we might not be able to
				 */
				if (uptodate == disks-1) {
					PRINTK("Computing block %d\n", i);
					compute_block(sh, i);
					uptodate++;
				} else if (test_bit(R5_Insync, &dev->flags)) {
					set_bit(R5_LOCKED, &dev->flags);
					set_bit(R5_Wantread, &dev->flags);
#if 0
					/* if I am just reading this block and we don't have
					   a failed drive, or any pending writes then sidestep the cache */
					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
					    ! syncing && !failed && !to_write) {
						sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
						sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
					}
#endif
					locked++;
					PRINTK("Reading block %d (sync=%d)\n",
						i, syncing);
					if (syncing)
						md_sync_acct(conf->disks[i].rdev->bdev,
							     STRIPE_SECTORS);
				}
			}
		}
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	/* now to consider writing and what else, if anything should be read */
	if (to_write) {
		int rmw=0, rcw=0;
		for (i=disks ; i--;) {
			/* would I have to read this buffer for read_modify_write */
			dev = &sh->dev[i];
			if ((dev->towrite || i == sh->pd_idx) &&
			    (!test_bit(R5_LOCKED, &dev->flags)
#if 0
|| sh->bh_page[i]!=bh->b_page
#endif
				    ) &&
			    !test_bit(R5_UPTODATE, &dev->flags)) {
				if (test_bit(R5_Insync, &dev->flags)
/*				    && !(!mddev->insync && i == sh->pd_idx) */
					)
					rmw++;
				else rmw += 2*disks;  /* cannot read it */
			}
			/* Would I have to read this buffer for reconstruct_write */
			if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
			    (!test_bit(R5_LOCKED, &dev->flags)
#if 0
|| sh->bh_page[i] != bh->b_page
#endif
				    ) &&
			    !test_bit(R5_UPTODATE, &dev->flags)) {
				if (test_bit(R5_Insync, &dev->flags)) rcw++;
				else rcw += 2*disks;
			}
		}
		PRINTK("for sector %llu, rmw=%d rcw=%d\n",
			(unsigned long long)sh->sector, rmw, rcw);
		set_bit(STRIPE_HANDLE, &sh->state);
		if (rmw < rcw && rmw > 0)
			/* prefer read-modify-write, but need to get some data */
			for (i=disks; i--;) {
				dev = &sh->dev[i];
				if ((dev->towrite || i == sh->pd_idx) &&
				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				    test_bit(R5_Insync, &dev->flags)) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
					{
						PRINTK("Read_old block %d for r-m-w\n", i);
						set_bit(R5_LOCKED, &dev->flags);
						set_bit(R5_Wantread, &dev->flags);
						locked++;
					} else {
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		if (rcw <= rmw && rcw > 0)
			/* want reconstruct write, but need to get some data */
			for (i=disks; i--;) {
				dev = &sh->dev[i];
				if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
				    !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
				    test_bit(R5_Insync, &dev->flags)) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
					{
						PRINTK("Read_old block %d for Reconstruct\n", i);
						set_bit(R5_LOCKED, &dev->flags);
						set_bit(R5_Wantread, &dev->flags);
						locked++;
					} else {
						set_bit(STRIPE_DELAYED, &sh->state);
						set_bit(STRIPE_HANDLE, &sh->state);
					}
				}
			}
		/* now if nothing is locked, and if we have enough data, we can start a write request */
		if (locked == 0 && (rcw == 0 || rmw == 0)) {
			PRINTK("Computing parity...\n");
			compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
			/* now every locked buffer is ready to be written */
			for (i=disks; i--;)
				if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
					PRINTK("Writing block %d\n", i);
					locked++;
					set_bit(R5_Wantwrite, &sh->dev[i].flags);
					if (!test_bit(R5_Insync, &sh->dev[i].flags)
					    || (i==sh->pd_idx && failed == 0))
						set_bit(STRIPE_INSYNC, &sh->state);
				}
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
		}
	}

	/* maybe we need to check and possibly fix the parity for this stripe
	 * Any reads will already have been scheduled, so we just see if enough data
	 * is available
	 */
	if (syncing && locked == 0 &&
	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
		set_bit(STRIPE_HANDLE, &sh->state);
		if (failed == 0) {
			char *pagea;
			if (uptodate != disks)
				BUG();
			compute_parity(sh, CHECK_PARITY);
			uptodate--;
			pagea = page_address(sh->dev[sh->pd_idx].page);
			if ((*(u32*)pagea) == 0 &&
			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
				/* parity is correct (on disc, not in buffer any more) */
				set_bit(STRIPE_INSYNC, &sh->state);
			}
		}
		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
			if (failed==0)
				failed_num = sh->pd_idx;
			/* should be able to compute the missing block and write it to spare */
			if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
				if (uptodate+1 != disks)
					BUG();
				compute_block(sh, failed_num);
				uptodate++;
			}
			if (uptodate != disks)
				BUG();
			dev = &sh->dev[failed_num];
			set_bit(R5_LOCKED, &dev->flags);
			set_bit(R5_Wantwrite, &dev->flags);
			locked++;
			set_bit(STRIPE_INSYNC, &sh->state);
			set_bit(R5_Syncio, &dev->flags);
		}
	}
	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
		clear_bit(STRIPE_SYNCING, &sh->state);
	}

	spin_unlock(&sh->lock);

	while ((bi=return_bi)) {
		int bytes = bi->bi_size;

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes, 0);
	}
	for (i=disks; i-- ;) {
		int rw;
		struct bio *bi;
		mdk_rdev_t *rdev;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
			rw = 1;
		else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = 0;
		else
			continue;

		bi = &sh->dev[i].req;

		bi->bi_rw = rw;
		if (rw)
			bi->bi_end_io = raid5_end_write_request;
		else
			bi->bi_end_io = raid5_end_read_request;

		spin_lock_irq(&conf->device_lock);
		rdev = conf->disks[i].rdev;
		if (rdev && rdev->faulty)
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		spin_unlock_irq(&conf->device_lock);

		if (rdev) {
			if (test_bit(R5_Syncio, &sh->dev[i].flags))
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			bi->bi_bdev = rdev->bdev;
			PRINTK("for %llu schedule op %ld on disc %d\n",
				(unsigned long long)sh->sector, bi->bi_rw, i);
			atomic_inc(&sh->count);
			bi->bi_sector = sh->sector + rdev->data_offset;
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_vcnt = 1;
			bi->bi_idx = 0;
			bi->bi_io_vec = &sh->dev[i].vec;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			generic_make_request(bi);
		} else {
			PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}
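/*
 * Stripes that could not be handled immediately are parked on
 * conf->delayed_list with STRIPE_DELAYED set.  Once the number of active
 * preread stripes drops below IO_THRESHOLD, move them back onto
 * conf->handle_list so the raid5 daemon will process them.
 */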
static inline void raid5_activate_delayed(raid5_conf_t *conf)
{
	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
		while (!list_empty(&conf->delayed_list)) {
			struct list_head *l = conf->delayed_list.next;
			struct stripe_head *sh;
			sh = list_entry(l, struct stripe_head, lru);
			list_del_init(l);
			clear_bit(STRIPE_DELAYED, &sh->state);
			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				atomic_inc(&conf->preread_active_stripes);
			list_add_tail(&sh->lru, &conf->handle_list);
		}
	}
}

static void unplug_slaves(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);
	int i;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		if (rdev && atomic_read(&rdev->nr_pending)) {
			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);

			atomic_inc(&rdev->nr_pending);
			spin_unlock_irqrestore(&conf->device_lock, flags);

			if (r_queue && r_queue->unplug_fn)
				r_queue->unplug_fn(r_queue);

			spin_lock_irqsave(&conf->device_lock, flags);
			atomic_dec(&rdev->nr_pending);
		}
	}
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

static void raid5_unplug_device(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);

	if (blk_remove_plug(q))
		raid5_activate_delayed(conf);
	md_wakeup_thread(mddev->thread);

	spin_unlock_irqrestore(&conf->device_lock, flags);

	unplug_slaves(mddev);
}

static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
			     sector_t *error_sector)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	int i, ret = 0;

	for (i=0; i<mddev->raid_disks; i++) {
		mdk_rdev_t *rdev = conf->disks[i].rdev;
		if (rdev && !rdev->faulty) {
			struct block_device *bdev = rdev->bdev;
			request_queue_t *r_queue;

			if (!bdev)
				continue;

			r_queue = bdev_get_queue(bdev);
			if (!r_queue)
				continue;

			if (!r_queue->issue_flush_fn) {
				ret = -EOPNOTSUPP;
				break;
			}

			ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
			if (ret)
				break;
		}
	}
	return ret;
}

static inline void raid5_plug_device(raid5_conf_t *conf)
{
	spin_lock_irq(&conf->device_lock);
	blk_plug_device(conf->mddev->queue);
	spin_unlock_irq(&conf->device_lock);
}
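/*
 * make_request() is the entry point for bios submitted to the raid5 array.
 * The bio is walked one stripe (STRIPE_SECTORS) at a time; each chunk is
 * mapped to its data/parity layout, attached to the matching stripe_head and
 * handled via handle_stripe().
 */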
static int make_request (request_queue_t *q, struct bio * bi)
{
	mddev_t *mddev = q->queuedata;
	raid5_conf_t *conf = mddev_to_conf(mddev);
	const unsigned int raid_disks = conf->raid_disks;
	const unsigned int data_disks = raid_disks - 1;
	unsigned int dd_idx, pd_idx;
	sector_t new_sector;
	sector_t logical_sector, last_sector;
	struct stripe_head *sh;

	if (bio_data_dir(bi)==WRITE) {
		disk_stat_inc(mddev->gendisk, writes);
		disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
	} else {
		disk_stat_inc(mddev->gendisk, reads);
		disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
	}

	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
	last_sector = bi->bi_sector + (bi->bi_size>>9);
	bi->bi_next = NULL;
	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
	if ( bio_data_dir(bi) == WRITE )
		md_write_start(mddev);
	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {

		new_sector = raid5_compute_sector(logical_sector,
						  raid_disks, data_disks, &dd_idx, &pd_idx, conf);

		PRINTK("raid5: make_request, sector %Lu logical %Lu\n",
			(unsigned long long)new_sector,
			(unsigned long long)logical_sector);

		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
		if (sh) {
			add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));

			raid5_plug_device(conf);
			handle_stripe(sh);
			release_stripe(sh);
		} else {
			/* cannot get stripe for read-ahead, just give-up */
			clear_bit(BIO_UPTODATE, &bi->bi_flags);
			break;
		}
	}
	spin_lock_irq(&conf->device_lock);
	if (--bi->bi_phys_segments == 0) {
		int bytes = bi->bi_size;

		if ( bio_data_dir(bi) == WRITE )
			md_write_end(mddev);
		bi->bi_size = 0;
		bi->bi_end_io(bi, bytes, 0);
	}