/* raid5.c */
				locked++;
			}
		}
		/* now if nothing is locked, and if we have enough data,
		 * we can start a write request
		 */
		if (locked == 0 && (rcw == 0 || rmw == 0)) {
			PRINTK("Computing parity...\n");
			compute_parity(sh, rcw == 0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
			/* now every locked buffer is ready to be written */
			for (i=disks; i--;)
				if (buffer_locked(sh->bh_cache[i])) {
					PRINTK("Writing block %d\n", i);
					locked++;
					action[i] = WRITE+1;
					if (!conf->disks[i].operational
					    || (i == sh->pd_idx && failed == 0))
						set_bit(STRIPE_INSYNC, &sh->state);
				}
		}
	}

	/* maybe we need to check and possibly fix the parity for this stripe.
	 * Any reads will already have been scheduled, so we just see if enough
	 * data is available.
	 */
	if (syncing && locked == 0 &&
	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
		set_bit(STRIPE_HANDLE, &sh->state);
		if (failed == 0) {
			if (uptodate != disks)
				BUG();
			compute_parity(sh, CHECK_PARITY);
			uptodate--;
			bh = sh->bh_cache[sh->pd_idx];
			if ((*(u32*)bh->b_data) == 0 &&
			    !memcmp(bh->b_data, bh->b_data + 4, bh->b_size - 4)) {
				/* parity is correct (on disc, not in buffer any more) */
				set_bit(STRIPE_INSYNC, &sh->state);
			}
		}
		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
			if (failed == 0)
				failed_num = sh->pd_idx;
			/* should be able to compute the missing block and write it to spare */
			if (!buffer_uptodate(sh->bh_cache[failed_num])) {
				if (uptodate + 1 != disks)
					BUG();
				compute_block(sh, failed_num);
				uptodate++;
			}
			if (uptodate != disks)
				BUG();
			bh = sh->bh_cache[failed_num];
			set_bit(BH_Lock, &bh->b_state);
			action[failed_num] = WRITE+1;
			locked++;
			set_bit(STRIPE_INSYNC, &sh->state);
			if (conf->disks[failed_num].operational)
				md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
			else if (conf->spare)
				md_sync_acct(conf->spare->dev, bh->b_size>>9);
		}
	}
	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
		md_done_sync(conf->mddev, (sh->size>>10) - sh->sync_redone, 1);
		clear_bit(STRIPE_SYNCING, &sh->state);
	}

	spin_unlock(&sh->lock);

	/* complete the requests on the success list, then the failure list */
	while ((bh = return_ok)) {
		return_ok = bh->b_reqnext;
		bh->b_reqnext = NULL;
		bh->b_end_io(bh, 1);
	}
	while ((bh = return_fail)) {
		return_fail = bh->b_reqnext;
		bh->b_reqnext = NULL;
		bh->b_end_io(bh, 0);
	}
	for (i=disks; i--;)
		if (action[i]) {
			struct buffer_head *bh = sh->bh_cache[i];
			int skip = 0;

			if (action[i] == READ+1)
				bh->b_end_io = raid5_end_read_request;
			else
				bh->b_end_io = raid5_end_write_request;
			if (conf->disks[i].operational)
				bh->b_dev = conf->disks[i].dev;
			else if (conf->spare && action[i] == WRITE+1)
				bh->b_dev = conf->spare->dev;
			else if (action[i] == READ+1)
				BUG();
			else
				skip = 1;
			if (!skip) {
				PRINTK("for %ld schedule op %d on disc %d\n",
				       sh->sector, action[i]-1, i);
				atomic_inc(&sh->count);
				bh->b_rdev = bh->b_dev;
				bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
				generic_make_request(action[i]-1, bh);
			} else {
				PRINTK("skip op %d on disc %d for sector %ld\n",
				       action[i]-1, i, sh->sector);
				clear_bit(BH_Lock, &bh->b_state);
				set_bit(STRIPE_HANDLE, &sh->state);
			}
		}
}
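/*
 * Illustrative sketch, not part of the driver: handle_stripe() above picks
 * between RECONSTRUCT_WRITE (rcw == 0: parity is recomputed by XORing every
 * data block) and READ_MODIFY_WRITE (rmw == 0: old data and old parity are
 * XORed out, new data XORed in).  The standalone userspace demo below, with
 * made-up block contents and sizes, checks that both routes yield the same
 * parity.  Compiled out with #if 0 so it cannot interfere with the driver.
 */
#if 0
#include <assert.h>
#include <string.h>

#define NDATA 3			/* data disks of a hypothetical 4-disk array */
#define BSIZE 16		/* tiny block size, demo only                */

static void xor_into(unsigned char *dst, const unsigned char *src)
{
	int i;
	for (i = 0; i < BSIZE; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	unsigned char blk[NDATA][BSIZE], parity[BSIZE], rmw[BSIZE];
	unsigned char newblk[BSIZE];
	int i;

	for (i = 0; i < NDATA; i++)
		memset(blk[i], i + 1, BSIZE);

	/* initial parity: XOR of all data blocks (reconstruct-write) */
	memset(parity, 0, BSIZE);
	for (i = 0; i < NDATA; i++)
		xor_into(parity, blk[i]);

	/* update block 0 by read-modify-write: parity ^= old ^ new */
	memset(newblk, 0x5a, BSIZE);
	memcpy(rmw, parity, BSIZE);
	xor_into(rmw, blk[0]);		/* XOR out the old data */
	xor_into(rmw, newblk);		/* XOR in the new data  */
	memcpy(blk[0], newblk, BSIZE);

	/* recompute parity from scratch and compare with the rmw result */
	memset(parity, 0, BSIZE);
	for (i = 0; i < NDATA; i++)
		xor_into(parity, blk[i]);
	assert(memcmp(parity, rmw, BSIZE) == 0);
	return 0;
}
#endif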
static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	const unsigned int raid_disks = conf->raid_disks;
	const unsigned int data_disks = raid_disks - 1;
	unsigned int dd_idx, pd_idx;
	unsigned long new_sector;
	int read_ahead = 0;
	struct stripe_head *sh;

	if (rw == READA) {
		rw = READ;
		read_ahead = 1;
	}

	new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks,
					  &dd_idx, &pd_idx, conf);

	PRINTK("raid5_make_request, sector %lu\n", new_sector);
	sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
	if (sh) {
		sh->pd_idx = pd_idx;
		add_stripe_bh(sh, bh, dd_idx, rw);
		handle_stripe(sh);
		release_stripe(sh);
	} else
		bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
	return 0;
}

/*
 * Determine correct block size for this device.
 */
unsigned int device_bsize (kdev_t dev)
{
	unsigned int i, correct_size;

	correct_size = BLOCK_SIZE;
	if (blksize_size[MAJOR(dev)]) {
		i = blksize_size[MAJOR(dev)][MINOR(dev)];
		if (i)
			correct_size = i;
	}
	return correct_size;
}

static int raid5_sync_request (mddev_t *mddev, unsigned long block_nr)
{
	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
	struct stripe_head *sh;
	int sectors_per_chunk = conf->chunk_size >> 9;
	unsigned long stripe = (block_nr<<1)/sectors_per_chunk;
	int chunk_offset = (block_nr<<1) % sectors_per_chunk;
	int dd_idx, pd_idx;
	unsigned long first_sector;
	int raid_disks = conf->raid_disks;
	int data_disks = raid_disks - 1;
	int redone = 0;
	int bufsize;

	sh = get_active_stripe(conf, block_nr<<1, 0, 0);
	bufsize = sh->size;
	redone = block_nr - (sh->sector>>1);
	/* called for the side effect of computing pd_idx for this stripe */
	first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
					    + chunk_offset, raid_disks, data_disks,
					    &dd_idx, &pd_idx, conf);
	sh->pd_idx = pd_idx;
	spin_lock(&sh->lock);
	set_bit(STRIPE_SYNCING, &sh->state);
	clear_bit(STRIPE_INSYNC, &sh->state);
	sh->sync_redone = redone;
	spin_unlock(&sh->lock);

	handle_stripe(sh);
	release_stripe(sh);

	return (bufsize>>10) - redone;
}

/*
 * This is our raid5 kernel thread.
 *
 * We scan the hash table for stripes which can be handled now.
 * During the scan, completed stripes are saved for us by the interrupt
 * handler, so that they will not have to wait for our next wakeup.
 */
static void raid5d (void *data)
{
	struct stripe_head *sh;
	raid5_conf_t *conf = data;
	mddev_t *mddev = conf->mddev;
	int handled;

	PRINTK("+++ raid5d active\n");

	handled = 0;

	if (mddev->sb_dirty) {
		mddev->sb_dirty = 0;
		md_update_sb(mddev);
	}
	md_spin_lock_irq(&conf->device_lock);
	while (!list_empty(&conf->handle_list)) {
		struct list_head *first = conf->handle_list.next;

		sh = list_entry(first, struct stripe_head, lru);

		list_del_init(first);
		atomic_inc(&sh->count);
		if (atomic_read(&sh->count) != 1)
			BUG();
		md_spin_unlock_irq(&conf->device_lock);

		handled++;
		handle_stripe(sh);
		release_stripe(sh);

		md_spin_lock_irq(&conf->device_lock);
	}
	PRINTK("%d stripes handled\n", handled);
	md_spin_unlock_irq(&conf->device_lock);

	PRINTK("--- raid5d inactive\n");
}
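/*
 * Illustrative sketch, not part of the driver: the address arithmetic that
 * raid5_compute_sector() performs for raid5_make_request() and
 * raid5_sync_request(), shown here for a left-asymmetric layout only (an
 * assumption for the demo; the driver supports several layouts).  A logical
 * sector splits into a chunk number plus an offset; the chunk number picks
 * a stripe and a data-disk slot, the stripe picks the parity disk, and
 * slots at or past the parity disk shift up by one.  All values are made up.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long sector = 1000;		/* logical sector (demo value) */
	int raid_disks = 4;			/* total disks, incl. parity   */
	int data_disks = raid_disks - 1;
	int sectors_per_chunk = 8;		/* i.e. chunk_size >> 9        */
	unsigned long chunk_number, stripe, new_sector;
	int chunk_offset, dd_idx, pd_idx;

	chunk_offset = sector % sectors_per_chunk;
	chunk_number = sector / sectors_per_chunk;
	dd_idx = chunk_number % data_disks;	/* data slot inside the stripe */
	stripe = chunk_number / data_disks;

	/* left-asymmetric: parity steps backwards one disk per stripe */
	pd_idx = data_disks - stripe % raid_disks;
	if (dd_idx >= pd_idx)			/* skip over the parity disk */
		dd_idx++;

	/* sector within each member device's slice of this stripe */
	new_sector = stripe * sectors_per_chunk + chunk_offset;

	printf("sector %lu -> stripe %lu, data disk %d, parity disk %d, device sector %lu\n",
	       sector, stripe, dd_idx, pd_idx, new_sector);
	return 0;
}
#endif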
/*
 * Private kernel thread for parity reconstruction after an unclean
 * shutdown. Reconstruction on spare drives in case of a failed drive
 * is done by the generic mdsyncd.
 */
static void raid5syncd (void *data)
{
	raid5_conf_t *conf = data;
	mddev_t *mddev = conf->mddev;

	if (!conf->resync_parity)
		return;
	if (conf->resync_parity == 2)
		return;
	down(&mddev->recovery_sem);
	if (md_do_sync(mddev, NULL)) {
		up(&mddev->recovery_sem);
		printk("raid5: resync aborted!\n");
		return;
	}
	conf->resync_parity = 0;
	up(&mddev->recovery_sem);
	printk("raid5: resync finished.\n");
}

static int __check_consistency (mddev_t *mddev, int row)
{
	raid5_conf_t *conf = mddev->private;
	kdev_t dev;
	struct buffer_head *bh[MD_SB_DISKS], *tmp = NULL;
	int i, ret = 0, nr = 0, count;
	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];

	if (conf->working_disks != conf->raid_disks)
		goto out;
	tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
	if (!tmp)
		goto out;
	tmp->b_size = 4096;
	tmp->b_page = alloc_page(GFP_KERNEL);
	tmp->b_data = page_address(tmp->b_page);
	if (!tmp->b_data)
		goto out;
	md_clear_page(tmp->b_data);
	memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *));
	for (i = 0; i < conf->raid_disks; i++) {
		dev = conf->disks[i].dev;
		set_blocksize(dev, 4096);
		bh[i] = bread(dev, row / 4, 4096);
		if (!bh[i])
			break;
		nr++;
	}
	if (nr == conf->raid_disks) {
		/* XOR every block but the first into the zeroed scratch page,
		 * then compare with block 0: equal means the parity is good */
		bh_ptr[0] = tmp;
		count = 1;
		for (i = 1; i < nr; i++) {
			bh_ptr[count++] = bh[i];
			if (count == MAX_XOR_BLOCKS) {
				xor_block(count, &bh_ptr[0]);
				count = 1;
			}
		}
		if (count != 1)
			xor_block(count, &bh_ptr[0]);

		if (memcmp(tmp->b_data, bh[0]->b_data, 4096))
			ret = 1;
	}
	for (i = 0; i < conf->raid_disks; i++) {
		dev = conf->disks[i].dev;
		if (bh[i]) {
			bforget(bh[i]);
			bh[i] = NULL;
		}
		fsync_dev(dev);
		invalidate_buffers(dev);
	}
	free_page((unsigned long) tmp->b_data);
out:
	if (tmp)
		kfree(tmp);
	return ret;
}

static int check_consistency (mddev_t *mddev)
{
	if (__check_consistency(mddev, 0))
		/*
		 * We are not checking this currently, as it's legitimate to
		 * have an inconsistent array, at creation time.
		 */
		return 0;
	return 0;
}
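/*
 * Illustrative sketch, not part of the driver: the invariant that
 * __check_consistency() tests above.  XORing the blocks of all disks but
 * one must reproduce the remaining block, i.e. the XOR across all
 * raid_disks blocks is zero; the same identity is what lets a single
 * failed disk be rebuilt from the survivors.  Standalone userspace demo
 * with made-up contents, compiled out with #if 0.
 */
#if 0
#include <assert.h>
#include <string.h>

#define NDISKS 4		/* hypothetical array size */
#define BSIZE 16		/* tiny block size, demo only */

int main(void)
{
	unsigned char blk[NDISKS][BSIZE], rebuilt[BSIZE];
	int d, i;

	for (d = 0; d < NDISKS - 1; d++)
		memset(blk[d], 0x11 * (d + 1), BSIZE);

	/* last disk holds parity: XOR of all the data blocks */
	memset(blk[NDISKS - 1], 0, BSIZE);
	for (d = 0; d < NDISKS - 1; d++)
		for (i = 0; i < BSIZE; i++)
			blk[NDISKS - 1][i] ^= blk[d][i];

	/* consistency check: XOR across every disk must be all zero */
	for (i = 0; i < BSIZE; i++) {
		unsigned char x = 0;
		for (d = 0; d < NDISKS; d++)
			x ^= blk[d][i];
		assert(x == 0);
	}

	/* rebuild a "failed" disk 1 from the other three */
	memset(rebuilt, 0, BSIZE);
	for (d = 0; d < NDISKS; d++) {
		if (d == 1)
			continue;
		for (i = 0; i < BSIZE; i++)
			rebuilt[i] ^= blk[d][i];
	}
	assert(memcmp(rebuilt, blk[1], BSIZE) == 0);
	return 0;
}
#endif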
static int raid5_run (mddev_t *mddev)
{
	raid5_conf_t *conf;
	int i, j, raid_disk, memory;
	mdp_super_t *sb = mddev->sb;
	mdp_disk_t *desc;
	mdk_rdev_t *rdev;
	struct disk_info *disk;
	struct md_list_head *tmp;
	int start_recovery = 0;

	MOD_INC_USE_COUNT;

	if (sb->level != 5 && sb->level != 4) {
		printk("raid5: md%d: raid level not set to 4/5 (%d)\n",
		       mdidx(mddev), sb->level);
		MOD_DEC_USE_COUNT;
		return -EIO;
	}

	mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
	if ((conf = mddev->private) == NULL)
		goto abort;
	memset (conf, 0, sizeof (*conf));
	conf->mddev = mddev;

	if ((conf->stripe_hashtbl = (struct stripe_head **)
	     md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
		goto abort;
	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);

	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
	md_init_waitqueue_head(&conf->wait_for_stripe);
	INIT_LIST_HEAD(&conf->handle_list);
	INIT_LIST_HEAD(&conf->inactive_list);
	atomic_set(&conf->active_stripes, 0);
	conf->buffer_size = PAGE_SIZE; /* good default for rebuild */

	PRINTK("raid5_run(md%d) called.\n", mdidx(mddev));

	ITERATE_RDEV(mddev,rdev,tmp) {
		/*
		 * This is important -- we are using the descriptor on
		 * the disk only to get a pointer to the descriptor on
		 * the main superblock, which might be more recent.
		 */
		desc = sb->disks + rdev->desc_nr;
		raid_disk = desc->raid_disk;
		disk = conf->disks + raid_disk;

		if (disk_faulty(desc)) {
			printk(KERN_ERR "raid5: disabled device %s (errors detected)\n",
			       partition_name(rdev->dev));
			if (!rdev->faulty) {
				MD_BUG();
				goto abort;
			}
			disk->number = desc->number;
			disk->raid_disk = raid_disk;
			disk->dev = rdev->dev;

			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 0;
			disk->used_slot = 1;
			continue;
		}
		if (disk_active(desc)) {
			if (!disk_sync(desc)) {
				printk(KERN_ERR "raid5: disabled device %s (not in sync)\n",
				       partition_name(rdev->dev));
				MD_BUG();
				goto abort;
			}
			if (raid_disk > sb->raid_disks) {
				printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n",
				       partition_name(rdev->dev));
				continue;
			}
			if (disk->operational) {
				printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n",
				       partition_name(rdev->dev), raid_disk);
				continue;
			}
			printk(KERN_INFO "raid5: device %s operational as raid disk %d\n",
			       partition_name(rdev->dev), raid_disk);

			disk->number = desc->number;
			disk->raid_disk = raid_disk;
			disk->dev = rdev->dev;
			disk->operational = 1;
			disk->used_slot = 1;

			conf->working_disks++;
		} else {
			/*
			 * Must be a spare disk ..
			 */
			printk(KERN_INFO "raid5: spare disk %s\n",
			       partition_name(rdev->dev));
			disk->number = desc->number;
			disk->raid_disk = raid_disk;
			disk->dev = rdev->dev;

			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 1;
			disk->used_slot = 1;
		}
	}

	for (i = 0; i < MD_SB_DISKS; i++) {
		desc = sb->disks + i;
		raid_disk = desc->raid_disk;
		disk = conf->disks + raid_disk;

		if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
		    !conf->disks[raid_disk].used_slot) {
			disk->number = desc->number;
			disk->raid_disk = raid_disk;
			disk->dev = MKDEV(0,0);

			disk->operational = 0;
			disk->write_only = 0;
			disk->spare = 0;
			disk->used_slot = 1;
		}
	}

	conf->raid_disks = sb->raid_disks;
	/*
	 * 0 for a fully functional array, 1 for a degraded array.
	 */
	conf->failed_disks = conf->raid_disks - conf->working_disks;
	conf->mddev = mddev;
	conf->chunk_size = sb->chunk_size;
	conf->level = sb->level;
	conf->algorithm = sb->layout;
	conf->max_nr_stripes = NR_STRIPES;

#if 0
	for (i = 0; i < conf->raid_disks; i++) {
		if (!conf->disks[i].used_slot) {
			MD_BUG();
			goto abort;
		}
	}
#endif
	if (!conf->chunk_size || conf->chunk_size % 4) {
		printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n",
		       conf->chunk_size, mdidx(mddev));
		goto abort;
	}
	if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
		printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n",
		       conf->algorithm, mdidx(mddev));
		goto abort;
	}
	if (conf->failed_disks > 1) {
		printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n",
		       mdidx(mddev), conf->failed_disks, conf->raid_disks);
		goto abort;
	}

	if (conf->working_disks != sb->raid_disks) {
		printk(KERN_ALERT
		       "raid5: md%d, not all disks are operational -- trying to recover array\n",
		       mdidx(mddev));
		start_recovery = 1;
	}

	if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) &&
	    check_consistency(mddev)) {
		printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n");
		sb->state &= ~(1 << MD_SB_CLEAN);
	}