eeh.c

来自「底层驱动开发」· C语言 代码 · 共 944 行 · 第 1/2 页

C
944
字号
	struct eeh_event	*event;	while (1) {		spin_lock_irqsave(&eeh_eventlist_lock, flags);		event = NULL;		if (!list_empty(&eeh_eventlist)) {			event = list_entry(eeh_eventlist.next, struct eeh_event, list);			list_del(&event->list);		}		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);		if (event == NULL)			break;		printk(KERN_INFO "EEH: MMIO failure (%d), notifiying device "		       "%s\n", event->reset_state,		       pci_name(event->dev));		atomic_set(&eeh_fail_count, 0);		notifier_call_chain (&eeh_notifier_chain,				     EEH_NOTIFY_FREEZE, event);		__get_cpu_var(slot_resets)++;		pci_dev_put(event->dev);		kfree(event);	}}/** * eeh_token_to_phys - convert EEH address token to phys address * @token i/o token, should be address in the form 0xE.... */static inline unsigned long eeh_token_to_phys(unsigned long token){	pte_t *ptep;	unsigned long pa;	ptep = find_linux_pte(init_mm.pgd, token);	if (!ptep)		return token;	pa = pte_pfn(*ptep) << PAGE_SHIFT;	return pa | (token & (PAGE_SIZE-1));}/** * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze * @dn device node * @dev pci device, if known * * Check for an EEH failure for the given device node.  Call this * routine if the result of a read was all 0xff's and you want to * find out if this is due to an EEH slot freeze.  This routine * will query firmware for the EEH status. * * Returns 0 if there has not been an EEH error; otherwise returns * a non-zero value and queues up a solt isolation event notification. * * It is safe to call this routine in an interrupt context. */int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev){	int ret;	int rets[3];	unsigned long flags;	int rc, reset_state;	struct eeh_event  *event;	struct pci_dn *pdn;	__get_cpu_var(total_mmio_ffs)++;	if (!eeh_subsystem_enabled)		return 0;	if (!dn)		return 0;	pdn = dn->data;	/* Access to IO BARs might get this far and still not want checking. */	if (!pdn->eeh_capable || !(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||	    pdn->eeh_mode & EEH_MODE_NOCHECK) {		return 0;	}	if (!pdn->eeh_config_addr) {		return 0;	}	/*	 * If we already have a pending isolation event for this	 * slot, we know it's bad already, we don't need to check...	 */	if (pdn->eeh_mode & EEH_MODE_ISOLATED) {		atomic_inc(&eeh_fail_count);		if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {			/* re-read the slot reset state */			if (read_slot_reset_state(dn, rets) != 0)				rets[0] = -1;	/* reset state unknown */			eeh_panic(dev, rets[0]);		}		return 0;	}	/*	 * Now test for an EEH failure.  This is VERY expensive.	 * Note that the eeh_config_addr may be a parent device	 * in the case of a device behind a bridge, or it may be	 * function zero of a multi-function device.	 * In any case they must share a common PHB.	 */	ret = read_slot_reset_state(dn, rets);	if (!(ret == 0 && rets[1] == 1 && (rets[0] == 2 || rets[0] == 4))) {		__get_cpu_var(false_positives)++;		return 0;	}	/* prevent repeated reports of this failure */	pdn->eeh_mode |= EEH_MODE_ISOLATED;	reset_state = rets[0];	spin_lock_irqsave(&slot_errbuf_lock, flags);	memset(slot_errbuf, 0, eeh_error_buf_size);	rc = rtas_call(ibm_slot_error_detail,	               8, 1, NULL, pdn->eeh_config_addr,	               BUID_HI(pdn->phb->buid),	               BUID_LO(pdn->phb->buid), NULL, 0,	               virt_to_phys(slot_errbuf),	               eeh_error_buf_size,	               1 /* Temporary Error */);	if (rc == 0)		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);	spin_unlock_irqrestore(&slot_errbuf_lock, flags);	printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n",	       rets[0], dn->name, dn->full_name);	event = kmalloc(sizeof(*event), GFP_ATOMIC);	if (event == NULL) {		eeh_panic(dev, reset_state);		return 1; 	}	event->dev = dev;	event->dn = dn;	event->reset_state = reset_state;	/* We may or may not be called in an interrupt context */	spin_lock_irqsave(&eeh_eventlist_lock, flags);	list_add(&event->list, &eeh_eventlist);	spin_unlock_irqrestore(&eeh_eventlist_lock, flags);	/* Most EEH events are due to device driver bugs.  Having	 * a stack trace will help the device-driver authors figure	 * out what happened.  So print that out. */	dump_stack();	schedule_work(&eeh_event_wq);	return 0;}EXPORT_SYMBOL(eeh_dn_check_failure);/** * eeh_check_failure - check if all 1's data is due to EEH slot freeze * @token i/o token, should be address in the form 0xA.... * @val value, should be all 1's (XXX why do we need this arg??) * * Check for an eeh failure at the given token address. * Check for an EEH failure at the given token address.  Call this * routine if the result of a read was all 0xff's and you want to * find out if this is due to an EEH slot freeze event.  This routine * will query firmware for the EEH status. * * Note this routine is safe to call in an interrupt context. */unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val){	unsigned long addr;	struct pci_dev *dev;	struct device_node *dn;	/* Finding the phys addr + pci device; this is pretty quick. */	addr = eeh_token_to_phys((unsigned long __force) token);	dev = pci_get_device_by_addr(addr);	if (!dev)		return val;	dn = pci_device_to_OF_node(dev);	eeh_dn_check_failure (dn, dev);	pci_dev_put(dev);	return val;}EXPORT_SYMBOL(eeh_check_failure);struct eeh_early_enable_info {	unsigned int buid_hi;	unsigned int buid_lo;};/* Enable eeh for the given device node. */static void *early_enable_eeh(struct device_node *dn, void *data){	struct eeh_early_enable_info *info = data;	int ret;	char *status = get_property(dn, "status", NULL);	u32 *class_code = (u32 *)get_property(dn, "class-code", NULL);	u32 *vendor_id = (u32 *)get_property(dn, "vendor-id", NULL);	u32 *device_id = (u32 *)get_property(dn, "device-id", NULL);	u32 *regs;	int enable;	struct pci_dn *pdn = dn->data;	pdn->eeh_mode = 0;	if (status && strcmp(status, "ok") != 0)		return NULL;	/* ignore devices with bad status */	/* Ignore bad nodes. */	if (!class_code || !vendor_id || !device_id)		return NULL;	/* There is nothing to check on PCI to ISA bridges */	if (dn->type && !strcmp(dn->type, "isa")) {		pdn->eeh_mode |= EEH_MODE_NOCHECK;		return NULL;	}	/*	 * Now decide if we are going to "Disable" EEH checking	 * for this device.  We still run with the EEH hardware active,	 * but we won't be checking for ff's.  This means a driver	 * could return bad data (very bad!), an interrupt handler could	 * hang waiting on status bits that won't change, etc.	 * But there are a few cases like display devices that make sense.	 */	enable = 1;	/* i.e. we will do checking */	if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY)		enable = 0;	if (!enable)		pdn->eeh_mode |= EEH_MODE_NOCHECK;	/* Ok... see if this device supports EEH.  Some do, some don't,	 * and the only way to find out is to check each and every one. */	regs = (u32 *)get_property(dn, "reg", NULL);	if (regs) {		/* First register entry is addr (00BBSS00)  */		/* Try to enable eeh */		ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,				regs[0], info->buid_hi, info->buid_lo,				EEH_ENABLE);		if (ret == 0) {			eeh_subsystem_enabled = 1;			pdn->eeh_mode |= EEH_MODE_SUPPORTED;			pdn->eeh_config_addr = regs[0];#ifdef DEBUG			printk(KERN_DEBUG "EEH: %s: eeh enabled\n", dn->full_name);#endif		} else {			/* This device doesn't support EEH, but it may have an			 * EEH parent, in which case we mark it as supported. */			if (dn->parent && dn->parent->data			    && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {				/* Parent supports EEH. */				pdn->eeh_mode |= EEH_MODE_SUPPORTED;				pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr;				return NULL;			}		}	} else {		printk(KERN_WARNING "EEH: %s: unable to get reg property.\n",		       dn->full_name);	}	return NULL; }/* * Initialize EEH by trying to enable it for all of the adapters in the system. * As a side effect we can determine here if eeh is supported at all. * Note that we leave EEH on so failed config cycles won't cause a machine * check.  If a user turns off EEH for a particular adapter they are really * telling Linux to ignore errors.  Some hardware (e.g. POWER5) won't * grant access to a slot if EEH isn't enabled, and so we always enable * EEH for all slots/all devices. * * The eeh-force-off option disables EEH checking globally, for all slots. * Even if force-off is set, the EEH hardware is still enabled, so that * newer systems can boot. */void __init eeh_init(void){	struct device_node *phb, *np;	struct eeh_early_enable_info info;	np = of_find_node_by_path("/rtas");	if (np == NULL)		return;	ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");	ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");	ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");	ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");	ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");	if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE)		return;	eeh_error_buf_size = rtas_token("rtas-error-log-max");	if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {		eeh_error_buf_size = 1024;	}	if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {		printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated "		      "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX);		eeh_error_buf_size = RTAS_ERROR_LOG_MAX;	}	/* Enable EEH for all adapters.  Note that eeh requires buid's */	for (phb = of_find_node_by_name(NULL, "pci"); phb;	     phb = of_find_node_by_name(phb, "pci")) {		unsigned long buid;		struct pci_dn *pci;		buid = get_phb_buid(phb);		if (buid == 0 || phb->data == NULL)			continue;		pci = phb->data;		info.buid_lo = BUID_LO(buid);		info.buid_hi = BUID_HI(buid);		traverse_pci_devices(phb, early_enable_eeh, &info);	}	if (eeh_subsystem_enabled)		printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n");	else		printk(KERN_WARNING "EEH: No capable adapters found\n");}/** * eeh_add_device_early - enable EEH for the indicated device_node * @dn: device node for which to set up EEH * * This routine must be used to perform EEH initialization for PCI * devices that were added after system boot (e.g. hotplug, dlpar). * This routine must be called before any i/o is performed to the * adapter (inluding any config-space i/o). * Whether this actually enables EEH or not for this device depends * on the CEC architecture, type of the device, on earlier boot * command-line arguments & etc. */void eeh_add_device_early(struct device_node *dn){	struct pci_controller *phb;	struct eeh_early_enable_info info;	if (!dn || !dn->data)		return;	phb = PCI_DN(dn)->phb;	if (NULL == phb || 0 == phb->buid) {		printk(KERN_WARNING "EEH: Expected buid but found none\n");		return;	}	info.buid_hi = BUID_HI(phb->buid);	info.buid_lo = BUID_LO(phb->buid);	early_enable_eeh(dn, &info);}EXPORT_SYMBOL(eeh_add_device_early);/** * eeh_add_device_late - perform EEH initialization for the indicated pci device * @dev: pci device for which to set up EEH * * This routine must be used to complete EEH initialization for PCI * devices that were added after system boot (e.g. hotplug, dlpar). */void eeh_add_device_late(struct pci_dev *dev){	if (!dev || !eeh_subsystem_enabled)		return;#ifdef DEBUG	printk(KERN_DEBUG "EEH: adding device %s\n", pci_name(dev));#endif	pci_addr_cache_insert_device (dev);}EXPORT_SYMBOL(eeh_add_device_late);/** * eeh_remove_device - undo EEH setup for the indicated pci device * @dev: pci device to be removed * * This routine should be when a device is removed from a running * system (e.g. by hotplug or dlpar). */void eeh_remove_device(struct pci_dev *dev){	if (!dev || !eeh_subsystem_enabled)		return;	/* Unregister the device with the EEH/PCI address search system */#ifdef DEBUG	printk(KERN_DEBUG "EEH: remove device %s\n", pci_name(dev));#endif	pci_addr_cache_remove_device(dev);}EXPORT_SYMBOL(eeh_remove_device);static int proc_eeh_show(struct seq_file *m, void *v){	unsigned int cpu;	unsigned long ffs = 0, positives = 0, failures = 0;	unsigned long resets = 0;	for_each_cpu(cpu) {		ffs += per_cpu(total_mmio_ffs, cpu);		positives += per_cpu(false_positives, cpu);		failures += per_cpu(ignored_failures, cpu);		resets += per_cpu(slot_resets, cpu);	}	if (0 == eeh_subsystem_enabled) {		seq_printf(m, "EEH Subsystem is globally disabled\n");		seq_printf(m, "eeh_total_mmio_ffs=%ld\n", ffs);	} else {		seq_printf(m, "EEH Subsystem is enabled\n");		seq_printf(m, "eeh_total_mmio_ffs=%ld\n"			   "eeh_false_positives=%ld\n"			   "eeh_ignored_failures=%ld\n"			   "eeh_slot_resets=%ld\n"				"eeh_fail_count=%d\n",			   ffs, positives, failures, resets,				eeh_fail_count.counter);	}	return 0;}static int proc_eeh_open(struct inode *inode, struct file *file){	return single_open(file, proc_eeh_show, NULL);}static struct file_operations proc_eeh_operations = {	.open      = proc_eeh_open,	.read      = seq_read,	.llseek    = seq_lseek,	.release   = single_release,};static int __init eeh_init_proc(void){	struct proc_dir_entry *e;	if (systemcfg->platform & PLATFORM_PSERIES) {		e = create_proc_entry("ppc64/eeh", 0, NULL);		if (e)			e->proc_fops = &proc_eeh_operations;	}	return 0;}__initcall(eeh_init_proc);

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?