/*
 * 📄 raid5.c
 * 字号:
 * (Code-viewer page header retained above for provenance; it is not
 *  part of the original kernel source.)
 */
/* NOTE(review): this excerpt opens mid-function -- the two braces below
 * close scopes begun before the visible chunk, and conf, disk, sb,
 * mddev and dev are declared in the unseen function head. */
		}
	}
	/*
	 * handle errors in spares (during reconstruction)
	 */
	if (conf->spare) {
		disk = conf->spare;
		if (disk->dev == dev) {
			printk (KERN_ALERT "raid5: Disk failure on spare %s\n", partition_name (dev));
			if (!conf->spare->operational) {
				/* probably a SET_DISK_FAULTY ioctl */
				return -EIO;
			}
			/* Take the spare out of service, record the failure in
			 * the superblock counters, and wake the raid5 thread
			 * so it can notice the state change. */
			disk->operational = 0;
			disk->write_only = 0;
			conf->spare = NULL;
			mark_disk_faulty(sb->disks+disk->number);
			mark_disk_nonsync(sb->disks+disk->number);
			mark_disk_inactive(sb->disks+disk->number);
			sb->spare_disks--;
			sb->working_disks--;
			sb->failed_disks++;
			mddev->sb_dirty = 1;
			md_wakeup_thread(conf->thread);
			return 0;
		}
	}
	/* dev matched neither an active member (above, unseen) nor the spare */
	MD_BUG();
	return -EIO;
}

/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 *
 * Maps a linear array sector onto (data-disk index *dd_idx, parity-disk
 * index *pd_idx, per-disk sector).  For level 4 the parity disk is fixed
 * at the last slot; for level 5 it rotates per conf->algorithm.
 */
static unsigned long raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
			unsigned int data_disks, unsigned int * dd_idx,
			unsigned int * pd_idx, raid5_conf_t *conf)
{
	unsigned long stripe;
	unsigned long chunk_number;
	unsigned int chunk_offset;
	unsigned long new_sector;
	int sectors_per_chunk = conf->chunk_size >> 9;	/* chunk_size is bytes; >>9 gives 512-byte sectors */

	/* First compute the information on this sector */

	/*
	 * Compute the chunk number and the sector offset inside the chunk
	 */
	chunk_number = r_sector / sectors_per_chunk;
	chunk_offset = r_sector % sectors_per_chunk;

	/*
	 * Compute the stripe number
	 */
	stripe = chunk_number / data_disks;

	/*
	 * Compute the data disk and parity disk indexes inside the stripe
	 */
	*dd_idx = chunk_number % data_disks;

	/*
	 * Select the parity disk based on the user selected algorithm.
	 */
	if (conf->level == 4)
		*pd_idx = data_disks;	/* RAID4: parity always on the last disk */
	else switch (conf->algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
			*pd_idx = data_disks - stripe % raid_disks;
			/* dd_idx counts data slots only, so step over the
			 * parity slot when it falls at or before us */
			if (*dd_idx >= *pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_RIGHT_ASYMMETRIC:
			*pd_idx = stripe % raid_disks;
			if (*dd_idx >= *pd_idx)
				(*dd_idx)++;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
			*pd_idx = data_disks - stripe % raid_disks;
			/* symmetric layouts: data disks follow the parity
			 * disk cyclically around the stripe */
			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		case ALGORITHM_RIGHT_SYMMETRIC:
			*pd_idx = stripe % raid_disks;
			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
			break;
		default:
			printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
	}

	/*
	 * Finally, compute the new sector number
	 */
	new_sector = stripe * sectors_per_chunk + chunk_offset;
	return new_sector;
}

#if 0
/* Inverse of raid5_compute_sector(): given a stripe and a cache slot i,
 * recover the array block number.  Compiled out; it cross-checks itself
 * by running the forward mapping and comparing. */
static unsigned long compute_blocknr(struct stripe_head *sh, int i)
{
	raid5_conf_t *conf = sh->raid_conf;
	int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
	unsigned long new_sector = sh->sector, check;
	int sectors_per_chunk = conf->chunk_size >> 9;
	unsigned long stripe = new_sector / sectors_per_chunk;
	int chunk_offset = new_sector % sectors_per_chunk;
	int chunk_number, dummy1, dummy2, dd_idx = i;
	unsigned long r_sector, blocknr;

	/* undo the per-algorithm dd_idx adjustment made by the forward map */
	switch (conf->algorithm) {
		case ALGORITHM_LEFT_ASYMMETRIC:
		case ALGORITHM_RIGHT_ASYMMETRIC:
			if (i > sh->pd_idx)
				i--;
			break;
		case ALGORITHM_LEFT_SYMMETRIC:
		case ALGORITHM_RIGHT_SYMMETRIC:
			if (i < sh->pd_idx)
				i += raid_disks;
			i -= (sh->pd_idx + 1);
			break;
		default:
			printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
	}

	chunk_number = stripe * data_disks + i;
	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
	blocknr = r_sector / (sh->size >> 9);

	/* sanity check: forward mapping must reproduce our position */
	check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
		printk("compute_blocknr: map not correct\n");
		return 0;
	}
	return blocknr;
}
#endif

/* Flush the accumulated XOR sources once bh_ptr[] is full.  Relies on
 * the caller's local variables `count' and `bh_ptr'; bh_ptr[0] is the
 * destination and stays put, so count restarts at 1 after a flush. */
#define check_xor()	do {						\
				if (count == MAX_XOR_BLOCKS) {		\
					xor_block(count, bh_ptr);	\
count = 1;	\
				}				\
			} while(0)

/*
 * compute_block(): reconstruct cache block dd_idx of a stripe by XOR-ing
 * together all other up-to-date cached blocks, then mark it uptodate.
 * The printk flags any source block that is unexpectedly not present.
 */
static void compute_block(struct stripe_head *sh, int dd_idx)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, count, disks = conf->raid_disks;
	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;

	PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);

	/* destination starts zeroed so the XOR accumulation is clean */
	memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
	bh_ptr[0] = sh->bh_cache[dd_idx];
	count = 1;
	for (i = disks ; i--; ) {
		if (i == dd_idx)
			continue;
		bh = sh->bh_cache[i];
		if (buffer_uptodate(bh))
			bh_ptr[count++] = bh;
		else
			printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
		check_xor();
	}
	if (count != 1)
		xor_block(count, bh_ptr);
	set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
}

/*
 * compute_parity(): (re)compute the parity block of a stripe.
 *
 * method selects the strategy:
 *   READ_MODIFY_WRITE  - XOR old data out of the existing parity, then
 *                        XOR the new data in (old parity must be uptodate);
 *   RECONSTRUCT_WRITE  - zero the parity and XOR every data block in;
 *   CHECK_PARITY       - XOR all data into the existing parity buffer.
 * For the write methods, pending requests are moved from bh_write[] to
 * bh_written[] (via chosen[]) and their data copied into the cache.
 */
static void compute_parity(struct stripe_head *sh, int method)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
	struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
	struct buffer_head *chosen[MD_SB_DISKS];

	PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
	memset(chosen, 0, sizeof(chosen));

	count = 1;
	bh_ptr[0] = sh->bh_cache[pd_idx];	/* parity block is the XOR destination */
	switch(method) {
	case READ_MODIFY_WRITE:
		/* r-m-w requires the old parity to be valid */
		if (!buffer_uptodate(sh->bh_cache[pd_idx]))
			BUG();
		for (i=disks ; i-- ;) {
			if (i==pd_idx)
				continue;
			if (sh->bh_write[i] && buffer_uptodate(sh->bh_cache[i])) {
				/* XOR the old data out of the parity, and move
				 * the head pending write onto bh_written[] */
				bh_ptr[count++] = sh->bh_cache[i];
				chosen[i] = sh->bh_write[i];
				sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
				chosen[i]->b_reqnext = sh->bh_written[i];
				sh->bh_written[i] = chosen[i];
				check_xor();
			}
		}
		break;
	case RECONSTRUCT_WRITE:
		/* parity rebuilt from scratch: start from zeroes */
		memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
		for (i= disks; i-- ;)
			if (i!=pd_idx && sh->bh_write[i]) {
				chosen[i] = sh->bh_write[i];
				sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
				chosen[i]->b_reqnext = sh->bh_written[i];
				sh->bh_written[i] = chosen[i];
			}
		break;
	case CHECK_PARITY:
		break;
	}
	if (count>1) {
		xor_block(count, bh_ptr);
		count = 1;
	}

	/* copy new data from the chosen request buffers into the cache and
	 * lock those cache buffers for writing out */
	for (i = disks; i--;)
		if (chosen[i]) {
			struct buffer_head *bh = sh->bh_cache[i];
			char *bdata;
			bdata = bh_kmap(chosen[i]);
			memcpy(bh->b_data, bdata,sh->size);
			bh_kunmap(chosen[i]);
			set_bit(BH_Lock, &bh->b_state);
			mark_buffer_uptodate(bh, 1);
		}

	switch(method) {
	case RECONSTRUCT_WRITE:
	case CHECK_PARITY:
		/* XOR every data block into the parity destination */
		for (i=disks; i--;)
			if (i != pd_idx) {
				bh_ptr[count++] = sh->bh_cache[i];
				check_xor();
			}
		break;
	case READ_MODIFY_WRITE:
		/* XOR the freshly-copied new data back in */
		for (i = disks; i--;)
			if (chosen[i]) {
				bh_ptr[count++] = sh->bh_cache[i];
				check_xor();
			}
	}
	if (count != 1)
		xor_block(count, bh_ptr);

	if (method != CHECK_PARITY) {
		mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
		set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
	} else
		/* NOTE(review): for CHECK_PARITY the buffer now holds
		 * parity XOR data -- presumably all-zero iff consistent,
		 * and the caller inspects it; confirm against the caller. */
		mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
}

/*
 * add_stripe_bh(): queue an incoming read or write request bh on the
 * per-disk request list (bh_read[] or bh_write[]) of a stripe, linked
 * through b_reqnext.  Both the stripe lock and the device lock are held
 * while the list is modified.
 */
static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
{
	struct buffer_head **bhp;
	raid5_conf_t *conf = sh->raid_conf;

	PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);

	spin_lock(&sh->lock);
	spin_lock_irq(&conf->device_lock);
	bh->b_reqnext = NULL;
	if (rw == READ)
		bhp = &sh->bh_read[dd_idx];
	else
		bhp = &sh->bh_write[dd_idx];
	/* append at the tail; multiple requests for one sector are legal
	 * but unusual enough to log */
	while (*bhp) {
		printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
		bhp = & (*bhp)->b_reqnext;
	}
	*bhp = bh;
	spin_unlock_irq(&conf->device_lock);
	spin_unlock(&sh->lock);

	PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
}

/*
 * handle_stripe - do things to a stripe.
 *
 * We lock the stripe and then examine the state of various bits
 * to see what needs to be done.
 * Possible results:
 *    return some read request which now have data
 *    return some write requests which are safely on disc
 *    schedule a read on some buffers
 *    schedule a write of some buffers
 *    return confirmation of parity correctness
 *
 * Parity calculations are done inside the stripe lock
 * buffers are taken off read_list or write_list, and bh_cache buffers
 * get BH_Lock set before the stripe lock is released.
 *
 */
static void handle_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	int disks = conf->raid_disks;
	struct buffer_head *return_ok= NULL, *return_fail = NULL;
	int action[MD_SB_DISKS];	/* per-disk I/O to schedule (e.g. READ+1) */
	int i;
	int syncing;
	int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
	int failed_num=0;
	struct buffer_head *bh;

	PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
	memset(action, 0, sizeof(action));

	spin_lock(&sh->lock);
	clear_bit(STRIPE_HANDLE, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);

	syncing = test_bit(STRIPE_SYNCING, &sh->state);

	/* Now to look around and see what can be done */
	for (i=disks; i--; ) {
		bh = sh->bh_cache[i];
		PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, bh->b_state, sh->bh_read[i], sh->bh_write[i], sh->bh_written[i]);
		/* maybe we can reply to a read */
		if (buffer_uptodate(bh) && sh->bh_read[i]) {
			struct buffer_head *rbh, *rbh2;
			PRINTK("Return read for disc %d\n", i);
			/* detach the whole read list under the device lock,
			 * then copy data and chain requests onto return_ok */
			spin_lock_irq(&conf->device_lock);
			rbh = sh->bh_read[i];
			sh->bh_read[i] = NULL;
			spin_unlock_irq(&conf->device_lock);
			while (rbh) {
				char *bdata;
				bdata = bh_kmap(rbh);
				memcpy(bdata, bh->b_data, bh->b_size);
				bh_kunmap(rbh);
				rbh2 = rbh->b_reqnext;
				rbh->b_reqnext = return_ok;
				return_ok = rbh;
				rbh = rbh2;
			}
		}
		/* now count some things */
		if (buffer_locked(bh))
			locked++;
		if (buffer_uptodate(bh))
			uptodate++;
		if (sh->bh_read[i])
			to_read++;
		if (sh->bh_write[i])
			to_write++;
		if (sh->bh_written[i])
			written++;
		if (!conf->disks[i].operational) {
			failed++;
			failed_num = i;	/* only meaningful when failed == 1 */
		}
	}
	PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n", locked, uptodate, to_read, to_write, failed, failed_num);

	/* check if the array has lost two devices and, if so, some requests might
	 * need to be failed
	 */
	if (failed > 1 && to_read+to_write) {
		for (i=disks; i--; ) {
			/* fail all writes first */
			if (sh->bh_write[i])
				to_write--;
			while ((bh = sh->bh_write[i])) {
				sh->bh_write[i] = bh->b_reqnext;
				bh->b_reqnext = return_fail;
				return_fail = bh;
			}
			/* fail any reads if this device is non-operational */
			if (!conf->disks[i].operational) {
				spin_lock_irq(&conf->device_lock);
				if (sh->bh_read[i])
					to_read--;
				while ((bh = sh->bh_read[i])) {
					sh->bh_read[i] = bh->b_reqnext;
					bh->b_reqnext = return_fail;
					return_fail = bh;
				}
				spin_unlock_irq(&conf->device_lock);
			}
		}
	}
	if (failed > 1 && syncing) {
		/* with two failures the stripe cannot be resynced */
		md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,0);
		clear_bit(STRIPE_SYNCING, &sh->state);
		syncing = 0;
	}

	/* might be able to return some write requests if the parity block
	 * is safe, or on a failed drive
	 */
	bh = sh->bh_cache[sh->pd_idx];
	if ( written &&
	     ( (conf->disks[sh->pd_idx].operational && !buffer_locked(bh) && buffer_uptodate(bh))
	       || (failed == 1 && failed_num == sh->pd_idx)) ) {
		/* any written block on a uptodate or failed drive can be returned */
		for (i=disks; i--; )
			if (sh->bh_written[i]) {
				bh = sh->bh_cache[i];
				if (!conf->disks[sh->pd_idx].operational ||
				    (!buffer_locked(bh) && buffer_uptodate(bh)) ) {
					/* maybe we can return some write requests */
					struct buffer_head *wbh, *wbh2;
					PRINTK("Return write for disc %d\n", i);
					wbh = sh->bh_written[i];
					sh->bh_written[i] = NULL;
					while (wbh) {
						wbh2 = wbh->b_reqnext;
						wbh->b_reqnext = return_ok;
						return_ok = wbh;
						wbh = wbh2;
					}
				}
			}
	}

	/* Now we might consider reading some blocks, either to check/generate
	 * parity, or to satisfy requests
	 */
	if (to_read || (syncing && (uptodate+failed < disks))) {
		for (i=disks; i--;) {
			bh = sh->bh_cache[i];
			if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
			    (sh->bh_read[i] || syncing ||
			     (failed && sh->bh_read[failed_num]))) {
				/* we would like to get this block, possibly
				 * by computing it, but we might not be able to
				 */
				if (uptodate == disks-1) {
					/* all other blocks present: reconstruct by XOR */
					PRINTK("Computing block %d\n", i);
					compute_block(sh, i);
					uptodate++;
				} else if (conf->disks[i].operational) {
					set_bit(BH_Lock, &bh->b_state);
					action[i] = READ+1;
					/* if I am just reading this block and we don't
					   have a failed drive, or any pending writes then
					   sidestep the cache */
					if (sh->bh_page[i])
						BUG();
					if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
					    ! syncing && !failed && !to_write) {
						/* read straight into the requester's page;
						 * bh_page[] remembers the cache page to restore */
						sh->bh_page[i] = sh->bh_cache[i]->b_page;
						sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
						sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
					}
					locked++;
					PRINTK("Reading block %d (sync=%d)\n", i, syncing);
					if (syncing)
						md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
				}
			}
		}
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	/* now to consider writing and what else, if anything should be read */
	if (to_write) {
		int rmw=0, rcw=0;
		/* cost both strategies: read-modify-write needs the old data
		 * and parity; reconstruct-write needs every other data block */
		for (i=disks ; i--;) {
			/* would I have to read this buffer for read_modify_write */
			bh = sh->bh_cache[i];
			if ((sh->bh_write[i] || i == sh->pd_idx) &&
			    (!buffer_locked(bh) || sh->bh_page[i]) &&
			    !buffer_uptodate(bh)) {
				if (conf->disks[i].operational
					/* && !(conf->resync_parity && i == sh->pd_idx) */
					)
					rmw++;
				else
					rmw += 2*disks; /* cannot read it */
			}
			/* Would I have to read this buffer for reconstruct_write */
			if (!sh->bh_write[i] && i != sh->pd_idx &&
			    (!buffer_locked(bh) || sh->bh_page[i]) &&
			    !buffer_uptodate(bh)) {
				if (conf->disks[i].operational)
					rcw++;
				else
					rcw += 2*disks;	/* cannot read it */
			}
		}
		PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
		set_bit(STRIPE_HANDLE, &sh->state);
		if (rmw < rcw && rmw > 0)
			/* prefer read-modify-write, but need to get some data */
			for (i=disks; i--;) {
				bh = sh->bh_cache[i];
				if ((sh->bh_write[i] || i == sh->pd_idx) &&
				    !buffer_locked(bh) && !buffer_uptodate(bh) &&
				    conf->disks[i].operational) {
					if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
						PRINTK("Read_old block %d for r-m-w\n", i);
						set_bit(BH_Lock, &bh->b_state);
						action[i] = READ+1;
						locked++;
/* NOTE(review): the excerpt ends here -- the remainder of handle_stripe
 * (rcw pre-reads, write scheduling, parity check/sync completion and the
 * return of return_ok/return_fail lists) lies outside this chunk. */
/*
 * Code-viewer page footer (keyboard-shortcut help), not part of the
 * original kernel source; retained, commented out, for provenance:
 *   ⌨️ 快捷键说明 (keyboard shortcuts)
 *   复制代码 (copy code)        Ctrl + C
 *   搜索代码 (search code)      Ctrl + F
 *   全屏模式 (full screen)      F11
 *   切换主题 (toggle theme)     Ctrl + Shift + D
 *   显示快捷键 (show shortcuts) ?
 *   增大字号 (larger font)      Ctrl + =
 *   减小字号 (smaller font)     Ctrl + -
 */