📄 scsi_error.c
字号:
} } /* * If we have corrected all of the problems, then we are done. */ if (host->host_failed == 0) { ourrtn = TRUE; goto leave; } /* * Either the abort wasn't appropriate, or it didn't succeed. * Now try a bus device reset. Still, look to see whether we have * multiple devices that are jammed or not - if we have multiple devices, * it makes no sense to try BUS_DEVICE_RESET - we really would need * to try a BUS_RESET instead. * * Does this make sense - should we try BDR on each device individually? * Yes, definitely. */ SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDR\n")); for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) { for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) { if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) { break; } } if (SCloop == NULL) { continue; } /* * OK, we have a device that is having problems. Try and send * a bus device reset to it. * * FIXME(eric) - make sure we handle the case where multiple * commands to the same device have failed. They all must * get properly restarted. */ rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT); if (rtn == SUCCESS) { rtn = scsi_test_unit_ready(SCloop); if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) { rtn = scsi_eh_retry_command(SCloop); if (rtn == SUCCESS) { SCloop->host->host_failed--; scsi_eh_finish_command(&SCdone, SCloop); } } } } if (host->host_failed == 0) { ourrtn = TRUE; goto leave; } /* * If we ended up here, we have serious problems. The only thing left * to try is a full bus reset. If someone has grabbed the bus and isn't * letting go, then perhaps this will help. */ SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n")); /* * We really want to loop over the various channels, and do this on * a channel by channel basis. We should also check to see if any * of the failed commands are on soft_reset devices, and if so, skip * the reset. */ for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) { next_device: for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) { if (SCpnt->state != SCSI_STATE_FAILED && SCpnt->state != SCSI_STATE_TIMEOUT) { continue; } /* * We have a failed command. Make sure there are no other failed * commands on the same channel that are timed out and implement a * soft reset. */ for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) { for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) { if (SCloop->channel != SCpnt->channel) { continue; } if (SCloop->state != SCSI_STATE_FAILED && SCloop->state != SCSI_STATE_TIMEOUT) { continue; } if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) { /* * If this device uses the soft reset option, and this * is one of the devices acting up, then our only * option is to wait a bit, since the command is * supposedly still running. * * FIXME(eric) - right now we will just end up falling * through to the 'take device offline' case. * * FIXME(eric) - It is possible that the command completed * *after* the error recovery procedure started, and if this * is the case, we are worrying about nothing here. */ scsi_sleep(1 * HZ); goto next_device; } } } /* * We now know that we are able to perform a reset for the * bus that SCpnt points to. There are no soft-reset devices * with outstanding timed out commands. */ rtn = scsi_try_bus_reset(SCpnt); if (rtn == SUCCESS) { for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) { for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) { if (SCloop->channel != SCpnt->channel) { continue; } if (SCloop->state != SCSI_STATE_FAILED && SCloop->state != SCSI_STATE_TIMEOUT) { continue; } rtn = scsi_test_unit_ready(SCloop); if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) { rtn = scsi_eh_retry_command(SCloop); if (rtn == SUCCESS) { SCpnt->host->host_failed--; scsi_eh_finish_command(&SCdone, SCloop); } } /* * If the bus reset worked, but we are still unable to * talk to the device, take it offline. * FIXME(eric) - is this really the correct thing to do? */ if (rtn != SUCCESS) { SCloop->device->online = FALSE; SCloop->host->host_failed--; scsi_eh_finish_command(&SCdone, SCloop); } } } } } } if (host->host_failed == 0) { ourrtn = TRUE; goto leave; } /* * If we ended up here, we have serious problems. The only thing left * to try is a full host reset - perhaps the firmware on the device * crashed, or something like that. * * It is assumed that a succesful host reset will cause *all* information * about the command to be flushed from both the host adapter *and* the * device. * * FIXME(eric) - it isn't clear that devices that implement the soft reset * option can ever be cleared except via cycling the power. The problem is * that sending the host reset command will cause the host to forget * about the pending command, but the device won't forget. For now, we * skip the host reset option if any of the failed devices are configured * to use the soft reset option. */ for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) { next_device2: for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) { if (SCpnt->state != SCSI_STATE_FAILED && SCpnt->state != SCSI_STATE_TIMEOUT) { continue; } if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) { /* * If this device uses the soft reset option, and this * is one of the devices acting up, then our only * option is to wait a bit, since the command is * supposedly still running. * * FIXME(eric) - right now we will just end up falling * through to the 'take device offline' case. */ SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Unable to try hard host reset\n")); /* * Due to the spinlock, we will never get out of this * loop without a proper wait. (DB) */ scsi_sleep(1 * HZ); goto next_device2; } SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n")); /* * FIXME(eric) - we need to obtain a valid SCpnt to perform this call. */ rtn = scsi_try_host_reset(SCpnt); if (rtn == SUCCESS) { /* * FIXME(eric) we assume that all commands are flushed from the * controller. We should get a DID_RESET for all of the commands * that were pending. We should ignore these so that we can * guarantee that we are in a consistent state. * * I believe this to be the case right now, but this needs to be * tested. */ for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) { for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) { if (SCloop->state != SCSI_STATE_FAILED && SCloop->state != SCSI_STATE_TIMEOUT) { continue; } rtn = scsi_test_unit_ready(SCloop); if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) { rtn = scsi_eh_retry_command(SCloop); if (rtn == SUCCESS) { SCpnt->host->host_failed--; scsi_eh_finish_command(&SCdone, SCloop); } } if (rtn != SUCCESS) { SCloop->device->online = FALSE; SCloop->host->host_failed--; scsi_eh_finish_command(&SCdone, SCloop); } } } } } } /* * If we solved all of the problems, then let's rev up the engines again. */ if (host->host_failed == 0) { ourrtn = TRUE; goto leave; } /* * If the HOST RESET failed, then for now we assume that the entire host * adapter is too hosed to be of any use. For our purposes, however, it is * easier to simply take the devices offline that correspond to commands * that failed. */ SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offline\n")); for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) { for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) { if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) { SCloop->device->online = FALSE; /* * This should pass the failure up to the top level driver, and * it will have to try and do something intelligent with it. */ SCloop->host->host_failed--; if (SCloop->state == SCSI_STATE_TIMEOUT) { SCloop->result |= (DRIVER_TIMEOUT << 24); } SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %x\n", SCloop->device->id, SCloop->result)); scsi_eh_finish_command(&SCdone, SCloop); } } } if (host->host_failed != 0) { panic("scsi_unjam_host: Miscount of number of failed commands.\n"); } SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returning\n")); ourrtn = FALSE; leave: /* * We should have a list of commands that we 'finished' during the course of * error recovery. This should be the same as the list of commands that timed out * or failed. We are currently holding these things in a linked list - we didn't * put them in the bottom half queue because we wanted to keep things quiet while * we were working on recovery, and passing them up to the top level could easily * cause the top level to try and queue something else again. * * Start by marking that the host is no longer in error recovery. */ host->in_recovery = 0; /* * Take the list of commands, and stick them in the bottom half queue. * The current implementation of scsi_done will do this for us - if need * be we can create a special version of this function to do the * same job for us. */ for (SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) { SCdone = SCpnt->bh_next; SCpnt->bh_next = NULL; /* * Oh, this is a vile hack. scsi_done() expects a timer * to be running on the command. If there isn't, it assumes * that the command has actually timed out, and a timer * handler is running. That may well be how we got into * this fix, but right now things are stable. We add * a timer back again so that we can report completion. * scsi_done() will immediately remove said timer from * the command, and then process it. */ scsi_add_timer(SCpnt, 100, scsi_eh_times_out); scsi_done(SCpnt); } return (ourrtn);}/* * Function: scsi_error_handler * * Purpose: Handle errors/timeouts of scsi commands, try and clean up * and unjam the bus, and restart things. * * Arguments: host - host for which we are running. * * Returns: Never returns. * * Notes: This is always run in the context of a kernel thread. The * idea is that we start this thing up when the kernel starts * up (one per host that we detect), and it immediately goes to * sleep and waits for some event (i.e. failure). When this * takes place, we have the job of trying to unjam the bus * and restarting things. * */void scsi_error_handler(void *data){ struct Scsi_Host *host = (struct Scsi_Host *) data; int rtn; DECLARE_MUTEX_LOCKED(sem); /* * We only listen to signals if the HA was loaded as a module. * If the HA was compiled into the kernel, then we don't listen * to any signals. */ if( host->loaded_as_module ) { siginitsetinv(¤t->blocked, SHUTDOWN_SIGS); } else { siginitsetinv(¤t->blocked, 0); } lock_kernel(); /* * Flush resources */ daemonize(); /* * Set the name of this process. */ sprintf(current->comm, "scsi_eh_%d", host->host_no); host->eh_wait = &sem; host->ehandler = current; unlock_kernel(); /* * Wake up the thread that created us. */ SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %d\n", host->eh_notify->count.counter)); up(host->eh_notify); while (1) { /* * If we get a signal, it means we are supposed to go * away and die. This typically happens if the user is * trying to unload a module. */ SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n")); /* * Note - we always use down_interruptible with the semaphore * even if the module was loaded as part of the kernel. The * reason is that down() will cause this thread to be counted * in the load average as a running process, and down * interruptible doesn't. Given that we need to allow this * thread to die if the driver was loaded as a module, using * semaphores isn't unreasonable. */ down_interruptible(&sem); if( host->loaded_as_module ) { if (signal_pending(current)) break; } SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n")); host->eh_active = 1; /* * We have a host that is failing for some reason. Figure out * what we need to do to get it up and online again (if we can). * If we fail, we end up taking the thing offline. */ if (host->hostt->eh_strategy_handler != NULL) { rtn = host->hostt->eh_strategy_handler(host); } else { rtn = scsi_unjam_host(host); } host->eh_active = 0; /* * Note - if the above fails completely, the action is to take * individual devices offline and flush the queue of any * outstanding requests that may have been pending. When we * restart, we restart any I/O to any other devices on the bus * which are still online. */ scsi_restart_operations(host); } SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n")); /* * Make sure that nobody tries to wake us up again. */ host->eh_wait = NULL; /* * Knock this down too. From this point on, the host is flying * without a pilot. If this is because the module is being unloaded, * that's fine. If the user sent a signal to this thing, we are * potentially in real danger. */ host->in_recovery = 0; host->eh_active = 0; host->ehandler = NULL; /* * If anyone is waiting for us to exit (i.e. someone trying to unload * a driver), then wake up that process to let them know we are on * the way out the door. This may be overkill - I *think* that we * could probably just unload the driver and send the signal, and when * the error handling thread wakes up that it would just exit without * needing to touch any memory associated with the driver itself. */ if (host->eh_notify != NULL) up(host->eh_notify);}/* * Overrides for Emacs so that we follow Linus's tabbing style. * Emacs will notice this stuff at the end of the file and automatically * adjust the settings for this buffer only. This must remain at the end * of the file. * --------------------------------------------------------------------------- * Local variables: * c-indent-level: 4 * c-brace-imaginary-offset: 0 * c-brace-offset: -4 * c-argdecl-indent: 4 * c-label-offset: -4 * c-continued-statement-offset: 4 * c-continued-brace-offset: 0 * indent-tabs-mode: nil * tab-width: 8 * End: */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -