📄 scsi_error.c

📁 Linux内核源代码为压缩文件是<<Linux内核>>一书中的源代码
💻 C
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
		}	}	/*	 * If we have corrected all of the problems, then we are done.	 */	if (host->host_failed == 0) {		ourrtn = TRUE;		goto leave;	}	/*	 * Either the abort wasn't appropriate, or it didn't succeed.	 * Now try a bus device reset.  Still, look to see whether we have	 * multiple devices that are jammed or not - if we have multiple devices,	 * it makes no sense to try BUS_DEVICE_RESET - we really would need	 * to try a BUS_RESET instead.	 *	 * Does this make sense - should we try BDR on each device individually?	 * Yes, definitely.	 */	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {		for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {			if (SCloop->state == SCSI_STATE_FAILED			    || SCloop->state == SCSI_STATE_TIMEOUT) {				break;			}		}		if (SCloop == NULL) {			continue;		}		/*		 * OK, we have a device that is having problems.  Try and send		 * a bus device reset to it.		 *		 * FIXME(eric) - make sure we handle the case where multiple		 * commands to the same device have failed. They all must		 * get properly restarted.		 */		rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);		if (rtn == SUCCESS) {			rtn = scsi_test_unit_ready(SCloop);			if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {				rtn = scsi_eh_retry_command(SCloop);				if (rtn == SUCCESS) {					SCloop->host->host_failed--;					scsi_eh_finish_command(&SCdone, SCloop);				}			}		}	}	if (host->host_failed == 0) {		ourrtn = TRUE;		goto leave;	}	/*	 * If we ended up here, we have serious problems.  The only thing left	 * to try is a full bus reset.  If someone has grabbed the bus and isn't	 * letting go, then perhaps this will help.	 */	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n"));	/* 	 * We really want to loop over the various channels, and do this on	 * a channel by channel basis.  We should also check to see if any	 * of the failed commands are on soft_reset devices, and if so, skip	 * the reset.  	 */	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {	      next_device:		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {			if (SCpnt->state != SCSI_STATE_FAILED			    && SCpnt->state != SCSI_STATE_TIMEOUT) {				continue;			}			/*			 * We have a failed command.  Make sure there are no other failed			 * commands on the same channel that are timed out and implement a			 * soft reset.			 */			for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {				for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {					if (SCloop->channel != SCpnt->channel) {						continue;					}					if (SCloop->state != SCSI_STATE_FAILED					    && SCloop->state != SCSI_STATE_TIMEOUT) {						continue;					}					if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) {						/* 						 * If this device uses the soft reset option, and this						 * is one of the devices acting up, then our only						 * option is to wait a bit, since the command is						 * supposedly still running.  						 *						 * FIXME(eric) - right now we will just end up falling						 * through to the 'take device offline' case.						 *						 * FIXME(eric) - It is possible that the command completed						 * *after* the error recovery procedure started, and if this						 * is the case, we are worrying about nothing here.						 */						scsi_sleep(1 * HZ);						goto next_device;					}				}			}			/*			 * We now know that we are able to perform a reset for the			 * bus that SCpnt points to.  There are no soft-reset devices			 * with outstanding timed out commands.			 */			rtn = scsi_try_bus_reset(SCpnt);			if (rtn == SUCCESS) {				for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {					for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {						if (SCloop->channel != SCpnt->channel) {							continue;						}						if (SCloop->state != SCSI_STATE_FAILED						    && SCloop->state != SCSI_STATE_TIMEOUT) {							continue;						}						rtn = scsi_test_unit_ready(SCloop);						if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {							rtn = scsi_eh_retry_command(SCloop);							if (rtn == SUCCESS) {								SCpnt->host->host_failed--;								scsi_eh_finish_command(&SCdone, SCloop);							}						}						/*						 * If the bus reset worked, but we are still unable to						 * talk to the device, take it offline.						 * FIXME(eric) - is this really the correct thing to do?						 */						if (rtn != SUCCESS) {							SCloop->device->online = FALSE;							SCloop->host->host_failed--;							scsi_eh_finish_command(&SCdone, SCloop);						}					}				}			}		}	}	if (host->host_failed == 0) {		ourrtn = TRUE;		goto leave;	}	/*	 * If we ended up here, we have serious problems.  The only thing left	 * to try is a full host reset - perhaps the firmware on the device	 * crashed, or something like that.	 *	 * It is assumed that a succesful host reset will cause *all* information	 * about the command to be flushed from both the host adapter *and* the	 * device.	 *	 * FIXME(eric) - it isn't clear that devices that implement the soft reset	 * option can ever be cleared except via cycling the power.  The problem is	 * that sending the host reset command will cause the host to forget	 * about the pending command, but the device won't forget.  For now, we	 * skip the host reset option if any of the failed devices are configured	 * to use the soft reset option.	 */	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {	      next_device2:		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {			if (SCpnt->state != SCSI_STATE_FAILED			    && SCpnt->state != SCSI_STATE_TIMEOUT) {				continue;			}			if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) {				/* 				 * If this device uses the soft reset option, and this				 * is one of the devices acting up, then our only				 * option is to wait a bit, since the command is				 * supposedly still running.  				 *				 * FIXME(eric) - right now we will just end up falling				 * through to the 'take device offline' case.				 */				SCSI_LOG_ERROR_RECOVERY(3,							printk("scsi_unjam_host: Unable to try hard host reset\n"));				/*				 * Due to the spinlock, we will never get out of this				 * loop without a proper wait. (DB)				 */				scsi_sleep(1 * HZ);				goto next_device2;			}			SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));			/*			 * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.			 */			rtn = scsi_try_host_reset(SCpnt);			if (rtn == SUCCESS) {				/*				 * FIXME(eric) we assume that all commands are flushed from the				 * controller.  We should get a DID_RESET for all of the commands				 * that were pending.  We should ignore these so that we can				 * guarantee that we are in a consistent state.				 *				 * I believe this to be the case right now, but this needs to be				 * tested.				 */				for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {					for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {						if (SCloop->state != SCSI_STATE_FAILED						    && SCloop->state != SCSI_STATE_TIMEOUT) {							continue;						}						rtn = scsi_test_unit_ready(SCloop);						if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {							rtn = scsi_eh_retry_command(SCloop);							if (rtn == SUCCESS) {								SCpnt->host->host_failed--;								scsi_eh_finish_command(&SCdone, SCloop);							}						}						if (rtn != SUCCESS) {							SCloop->device->online = FALSE;							SCloop->host->host_failed--;							scsi_eh_finish_command(&SCdone, SCloop);						}					}				}			}		}	}	/*	 * If we solved all of the problems, then let's rev up the engines again.	 */	if (host->host_failed == 0) {		ourrtn = TRUE;		goto leave;	}	/*	 * If the HOST RESET failed, then for now we assume that the entire host	 * adapter is too hosed to be of any use.  For our purposes, however, it is	 * easier to simply take the devices offline that correspond to commands	 * that failed.	 */	SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offline\n"));	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {		for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {			if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) {				SCloop->device->online = FALSE;				/*				 * This should pass the failure up to the top level driver, and				 * it will have to try and do something intelligent with it.				 */				SCloop->host->host_failed--;				if (SCloop->state == SCSI_STATE_TIMEOUT) {					SCloop->result |= (DRIVER_TIMEOUT << 24);				}				SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %x\n",				    SCloop->device->id, SCloop->result));				scsi_eh_finish_command(&SCdone, SCloop);			}		}	}	if (host->host_failed != 0) {		panic("scsi_unjam_host: Miscount of number of failed commands.\n");	}	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returning\n"));	ourrtn = FALSE;      leave:	/*	 * We should have a list of commands that we 'finished' during the course of	 * error recovery.  This should be the same as the list of commands that timed out	 * or failed.  We are currently holding these things in a linked list - we didn't	 * put them in the bottom half queue because we wanted to keep things quiet while	 * we were working on recovery, and passing them up to the top level could easily	 * cause the top level to try and queue something else again.	 *	 * Start by marking that the host is no longer in error recovery.	 */	host->in_recovery = 0;	/*	 * Take the list of commands, and stick them in the bottom half queue.	 * The current implementation of scsi_done will do this for us - if need	 * be we can create a special version of this function to do the	 * same job for us.	 */	for (SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) {		SCdone = SCpnt->bh_next;		SCpnt->bh_next = NULL;                /*                 * Oh, this is a vile hack.  scsi_done() expects a timer                 * to be running on the command.  If there isn't, it assumes                 * that the command has actually timed out, and a timer                 * handler is running.  That may well be how we got into                 * this fix, but right now things are stable.  We add                 * a timer back again so that we can report completion.                 * scsi_done() will immediately remove said timer from                 * the command, and then process it.                 */		scsi_add_timer(SCpnt, 100, scsi_eh_times_out);		scsi_done(SCpnt);	}	return (ourrtn);}/* * Function:  scsi_error_handler * * Purpose:     Handle errors/timeouts of scsi commands, try and clean up *              and unjam the bus, and restart things. * * Arguments:   host    - host for which we are running. * * Returns:     Never returns. * * Notes:       This is always run in the context of a kernel thread.  The *              idea is that we start this thing up when the kernel starts *              up (one per host that we detect), and it immediately goes to *              sleep and waits for some event (i.e. failure).  When this *              takes place, we have the job of trying to unjam the bus *              and restarting things. * */void scsi_error_handler(void *data){	struct Scsi_Host *host = (struct Scsi_Host *) data;	int rtn;	DECLARE_MUTEX_LOCKED(sem);        /*         * We only listen to signals if the HA was loaded as a module.         * If the HA was compiled into the kernel, then we don't listen         * to any signals.         */        if( host->loaded_as_module ) {	siginitsetinv(&current->blocked, SHUTDOWN_SIGS);	} else {	siginitsetinv(&current->blocked, 0);        }	lock_kernel();	/*	 *    Flush resources	 */	daemonize();	/*	 * Set the name of this process.	 */	sprintf(current->comm, "scsi_eh_%d", host->host_no);	host->eh_wait = &sem;	host->ehandler = current;	unlock_kernel();	/*	 * Wake up the thread that created us.	 */	SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %d\n", host->eh_notify->count.counter));	up(host->eh_notify);	while (1) {		/*		 * If we get a signal, it means we are supposed to go		 * away and die.  This typically happens if the user is		 * trying to unload a module.		 */		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n"));		/*		 * Note - we always use down_interruptible with the semaphore		 * even if the module was loaded as part of the kernel.  The		 * reason is that down() will cause this thread to be counted		 * in the load average as a running process, and down		 * interruptible doesn't.  Given that we need to allow this		 * thread to die if the driver was loaded as a module, using		 * semaphores isn't unreasonable.		 */		down_interruptible(&sem);		if( host->loaded_as_module ) {			if (signal_pending(current))				break;                }		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n"));		host->eh_active = 1;		/*		 * We have a host that is failing for some reason.  Figure out		 * what we need to do to get it up and online again (if we can).		 * If we fail, we end up taking the thing offline.		 */		if (host->hostt->eh_strategy_handler != NULL) {			rtn = host->hostt->eh_strategy_handler(host);		} else {			rtn = scsi_unjam_host(host);		}		host->eh_active = 0;		/*		 * Note - if the above fails completely, the action is to take		 * individual devices offline and flush the queue of any		 * outstanding requests that may have been pending.  When we		 * restart, we restart any I/O to any other devices on the bus		 * which are still online.		 */		scsi_restart_operations(host);	}	SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n"));	/*	 * Make sure that nobody tries to wake us up again.	 */	host->eh_wait = NULL;	/*	 * Knock this down too.  From this point on, the host is flying	 * without a pilot.  If this is because the module is being unloaded,	 * that's fine.  If the user sent a signal to this thing, we are	 * potentially in real danger.	 */	host->in_recovery = 0;	host->eh_active = 0;	host->ehandler = NULL;	/*	 * If anyone is waiting for us to exit (i.e. someone trying to unload	 * a driver), then wake up that process to let them know we are on	 * the way out the door.  This may be overkill - I *think* that we	 * could probably just unload the driver and send the signal, and when	 * the error handling thread wakes up that it would just exit without	 * needing to touch any memory associated with the driver itself.	 */	if (host->eh_notify != NULL)		up(host->eh_notify);}/* * Overrides for Emacs so that we follow Linus's tabbing style. * Emacs will notice this stuff at the end of the file and automatically * adjust the settings for this buffer only.  This must remain at the end * of the file. * --------------------------------------------------------------------------- * Local variables: * c-indent-level: 4 * c-brace-imaginary-offset: 0 * c-brace-offset: -4 * c-argdecl-indent: 4 * c-label-offset: -4 * c-continued-statement-offset: 4 * c-continued-brace-offset: 0 * indent-tabs-mode: nil * tab-width: 8 * End: */
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -