📄 scsi_error.c
字号:
if( SCloop == NULL ) { continue; } /* * OK, we have a device that is having problems. Try and send * a bus device reset to it. * * FIXME(eric) - make sure we handle the case where multiple * commands to the same device have failed. They all must * get properly restarted. */ rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT); if( rtn == SUCCESS ) { rtn = scsi_test_unit_ready(SCloop); if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) ) { rtn = scsi_eh_retry_command(SCloop); if( rtn == SUCCESS ) { SCloop->host->host_failed--; scsi_eh_finish_command(&SCdone,SCloop); } } } } if( host->host_failed == 0 ) { ourrtn = TRUE; goto leave; } /* * If we ended up here, we have serious problems. The only thing left * to try is a full bus reset. If someone has grabbed the bus and isn't * letting go, then perhaps this will help. */ SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Try hard bus reset\n")); /* * We really want to loop over the various channels, and do this on * a channel by channel basis. We should also check to see if any * of the failed commands are on soft_reset devices, and if so, skip * the reset. */ for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next) {next_device: for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) { if( SCpnt->state != SCSI_STATE_FAILED && SCpnt->state != SCSI_STATE_TIMEOUT ) { continue; } /* * We have a failed command. Make sure there are no other failed * commands on the same channel that are timed out and implement a * soft reset. */ for(SDloop=host->host_queue; SDloop; SDloop = SDloop->next) { for(SCloop=SDloop->device_queue; SCloop; SCloop = SCloop->next) { if( SCloop->channel != SCpnt->channel ) { continue; } if( SCloop->state != SCSI_STATE_FAILED && SCloop->state != SCSI_STATE_TIMEOUT ) { continue; } if( SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT ) { /* * If this device uses the soft reset option, and this * is one of the devices acting up, then our only * option is to wait a bit, since the command is * supposedly still running. * * FIXME(eric) - right now we will just end up falling * through to the 'take device offline' case. * * FIXME(eric) - It is possible that the command completed * *after* the error recovery procedure started, and if this * is the case, we are worrying about nothing here. */ /* * Due to the spinlock, we will never get out of this * loop without a proper wait (DB) */ scsi_sleep(1 * HZ); goto next_device; } } } /* * We now know that we are able to perform a reset for the * bus that SCpnt points to. There are no soft-reset devices * with outstanding timed out commands. */ rtn = scsi_try_bus_reset(SCpnt); if( rtn == SUCCESS ) { for(SDloop=host->host_queue; SDloop; SDloop = SDloop->next) { for(SCloop=SDloop->device_queue; SCloop; SCloop = SCloop->next) { if( SCloop->channel != SCpnt->channel ) { continue; } if( SCloop->state != SCSI_STATE_FAILED && SCloop->state != SCSI_STATE_TIMEOUT ) { continue; } rtn = scsi_test_unit_ready(SCloop); if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) ) { rtn = scsi_eh_retry_command(SCloop); if( rtn == SUCCESS ) { SCpnt->host->host_failed--; scsi_eh_finish_command(&SCdone,SCloop); } } /* * If the bus reset worked, but we are still unable to * talk to the device, take it offline. * FIXME(eric) - is this really the correct thing to do? */ if( rtn != SUCCESS ) { SCloop->device->online = FALSE; SCloop->host->host_failed--; scsi_eh_finish_command(&SCdone,SCloop); } } } } } } if( host->host_failed == 0 ) { ourrtn = TRUE; goto leave; } /* * If we ended up here, we have serious problems. The only thing left * to try is a full host reset - perhaps the firmware on the device * crashed, or something like that. * * It is assumed that a succesful host reset will cause *all* information * about the command to be flushed from both the host adapter *and* the * device. * * FIXME(eric) - it isn't clear that devices that implement the soft reset * option can ever be cleared except via cycling the power. The problem is * that sending the host reset command will cause the host to forget * about the pending command, but the device won't forget. For now, we * skip the host reset option if any of the failed devices are configured * to use the soft reset option. */ for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next) {next_device2: for(SCpnt=SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) { if( SCpnt->state != SCSI_STATE_FAILED && SCpnt->state != SCSI_STATE_TIMEOUT ) { continue; } if( SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT ) { /* * If this device uses the soft reset option, and this * is one of the devices acting up, then our only * option is to wait a bit, since the command is * supposedly still running. * * FIXME(eric) - right now we will just end up falling * through to the 'take device offline' case. */ SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Unable to try hard host reset\n")); /* * Due to the spinlock, we will never get out of this * loop without a proper wait. (DB) */ scsi_sleep(1 * HZ); goto next_device2; } SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Try hard host reset\n")); /* * FIXME(eric) - we need to obtain a valid SCpnt to perform this call. */ rtn = scsi_try_host_reset(SCpnt); if( rtn == SUCCESS ) { /* * FIXME(eric) we assume that all commands are flushed from the * controller. We should get a DID_RESET for all of the commands * that were pending. We should ignore these so that we can * guarantee that we are in a consistent state. * * I believe this to be the case right now, but this needs to be * tested. */ for(SDloop=host->host_queue; SDloop; SDloop = SDloop->next) { for(SCloop=SDloop->device_queue; SCloop; SCloop = SCloop->next) { if( SCloop->state != SCSI_STATE_FAILED && SCloop->state != SCSI_STATE_TIMEOUT ) { continue; } rtn = scsi_test_unit_ready(SCloop); if( rtn == SUCCESS && scsi_unit_is_ready(SCloop) ) { rtn = scsi_eh_retry_command(SCloop); if( rtn == SUCCESS ) { SCpnt->host->host_failed--; scsi_eh_finish_command(&SCdone,SCloop); } } if( rtn != SUCCESS ) { SCloop->device->online = FALSE; SCloop->host->host_failed--; scsi_eh_finish_command(&SCdone,SCloop); } } } } } } /* * If we solved all of the problems, then let's rev up the engines again. */ if( host->host_failed == 0 ) { ourrtn = TRUE; goto leave; } /* * If the HOST RESET failed, then for now we assume that the entire host * adapter is too hosed to be of any use. For our purposes, however, it is * easier to simply take the devices offline that correspond to commands * that failed. */ SCSI_LOG_ERROR_RECOVERY(1,printk("scsi_unjam_host: Take device offline\n")); for(SDpnt=host->host_queue; SDpnt; SDpnt = SDpnt->next) { for(SCloop=SDpnt->device_queue; SCloop; SCloop = SCloop->next) { if( SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT ) { SCloop->device->online = FALSE; /* * This should pass the failure up to the top level driver, and * it will have to try and do something intelligent with it. */ SCloop->host->host_failed--; if( SCloop->state == SCSI_STATE_TIMEOUT ) { SCloop->result |= (DRIVER_TIMEOUT << 24); } SCSI_LOG_ERROR_RECOVERY(3,printk("Finishing command for device %d %x\n", SCloop->device->id, SCloop->result)); scsi_eh_finish_command(&SCdone,SCloop); } } } if( host->host_failed != 0 ) { panic("scsi_unjam_host: Miscount of number of failed commands.\n"); } SCSI_LOG_ERROR_RECOVERY(3,printk("scsi_unjam_host: Returning\n")); ourrtn = FALSE;leave: /* * We should have a list of commands that we 'finished' during the course of * error recovery. This should be the same as the list of commands that timed out * or failed. We are currently holding these things in a linked list - we didn't * put them in the bottom half queue because we wanted to keep things quiet while * we were working on recovery, and passing them up to the top level could easily * cause the top level to try and queue something else again. * * Start by marking that the host is no longer in error recovery. */ host->in_recovery = 0; /* * Take the list of commands, and stick them in the bottom half queue. * The current implementation of scsi_done will do this for us - if need * be we can create a special version of this function to do the * same job for us. */ for(SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) { SCdone = SCpnt->bh_next; SCpnt->bh_next = NULL; scsi_done(SCpnt); } return (ourrtn);}#ifndef OSKIT/* * Function: scsi_error_handler * * Purpose: Handle errors/timeouts of scsi commands, try and clean up * and unjam the bus, and restart things. * * Arguments: host - host for which we are running. * * Returns: Never returns. * * Notes: This is always run in the context of a kernel thread. The * idea is that we start this thing up when the kernel starts * up (one per host that we detect), and it immediately goes to * sleep and waits for some event (i.e. failure). When this * takes place, we have the job of trying to unjam the bus * and restarting things. * */voidscsi_error_handler(void * data){ struct Scsi_Host * host = (struct Scsi_Host *) data; int rtn; struct semaphore sem = MUTEX_LOCKED; unsigned long flags; struct fs_struct *fs; lock_kernel(); /* * If we were started as result of loading a module, close all of the * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ exit_mm(current); current->session = 1; current->pgrp = 1; /* Become as one with the init task */ exit_fs(current); /* current->fs->count--; */ fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); siginitsetinv(¤t->blocked, SHUTDOWN_SIGS); /* * Set the name of this process. */ sprintf(current->comm, "scsi_eh_%d", host->host_no); host->eh_wait = &sem; host->ehandler = current; unlock_kernel(); /* * Wake up the thread that created us. */ SCSI_LOG_ERROR_RECOVERY(3,printk("Wake up parent %d\n", host->eh_notify->count.counter)); up(host->eh_notify); while(1) { /* * If we get a signal, it means we are supposed to go * away and die. This typically happens if the user is * trying to unload a module. */ SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler sleeping\n")); down_interruptible (&sem); if (signal_pending(current) ) break; SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler waking up\n")); spin_lock_irqsave(&io_request_lock, flags); host->eh_active = 1; /* * We have a host that is failing for some reason. Figure out * what we need to do to get it up and online again (if we can). * If we fail, we end up taking the thing offline. */ if( host->hostt->eh_strategy_handler != NULL ) { rtn = host->hostt->eh_strategy_handler(host); } else { rtn = scsi_unjam_host(host); } host->eh_active = 0; /* * Note - if the above fails completely, the action is to take * individual devices offline and flush the queue of any * outstanding requests that may have been pending. When we * restart, we restart any I/O to any other devices on the bus * which are still online. */ scsi_restart_operations(host); /* The spinlock is really needed up to this point. (DB) */ spin_unlock_irqrestore(&io_request_lock, flags); } SCSI_LOG_ERROR_RECOVERY(1,printk("Error handler exiting\n")); /* * Make sure that nobody tries to wake us up again. */ host->eh_wait = NULL; /* * Knock this down too. From this point on, the host is flying * without a pilot. If this is because the module is being unloaded, * that's fine. If the user sent a signal to this thing, we are * potentially in real danger. */ host->in_recovery = 0; host->eh_active = 0; host->ehandler = NULL; /* * If anyone is waiting for us to exit (i.e. someone trying to unload * a driver), then wake up that process to let them know we are on * the way out the door. This may be overkill - I *think* that we * could probably just unload the driver and send the signal, and when * the error handling thread wakes up that it would just exit without * needing to touch any memory associated with the driver itself. */ if( host->eh_notify != NULL ) up(host->eh_notify);}#endif/* * Overrides for Emacs so that we follow Linus's tabbing style. * Emacs will notice this stuff at the end of the file and automatically * adjust the settings for this buffer only. This must remain at the end * of the file. * --------------------------------------------------------------------------- * Local variables: * c-indent-level: 4 * c-brace-imaginary-offset: 0 * c-brace-offset: -4 * c-argdecl-indent: 4 * c-label-offset: -4 * c-continued-statement-offset: 4 * c-continued-brace-offset: 0 * indent-tabs-mode: nil * tab-width: 8 * End: */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -