mca.c
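/*
 * Editor's note (not part of mca.c): the RNAT copy loop below leans on
 * the ia64 RSE helpers.  For reference, they look roughly like the
 * definitions in include/asm-ia64/rse.h, sketched here from memory (not
 * verbatim): every 64th slot of the register backing store (slot number
 * 0x3f) holds the RNAT collection bits for the preceding 63 slots,
 * which is why the two stacks' different alignments force a
 * register-by-register copy.
 */
static __inline__ unsigned long
ia64_rse_slot_num (unsigned long *addr)
{
	/* bits 3..8 of a backing store address give the slot number */
	return (((unsigned long) addr) >> 3) & 0x3f;
}

static __inline__ int
ia64_rse_is_rnat_slot (unsigned long *addr)
{
	/* slot 0x3f is where the RSE spills ar.rnat */
	return ia64_rse_slot_num(addr) == 0x3f;
}

static __inline__ unsigned long *
ia64_rse_rnat_addr (unsigned long *slot_addr)
{
	/* address of the RNAT slot covering slot_addr */
	return (unsigned long *) ((unsigned long) slot_addr | (0x3f << 3));
}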
 * are in the current RBS, copy them back to the original stack.  The
 * copy must be done register by register because the original bspstore
 * and the current one have different alignments, so the saved RNAT
 * data occurs at different places.
 *
 * mca_asm does cover, so the old_bsp already includes all registers at
 * the time of MCA/INIT.  It also does flushrs, so all registers before
 * this function have been written to backing store on the MCA/INIT
 * stack.
 */
	new_rnat = ia64_get_rnat(ia64_rse_rnat_addr(new_bspstore));
	old_rnat = regs->ar_rnat;
	while (slots--) {
		if (ia64_rse_is_rnat_slot(new_bspstore)) {
			new_rnat = ia64_get_rnat(new_bspstore++);
		}
		if (ia64_rse_is_rnat_slot(old_bspstore)) {
			*old_bspstore++ = old_rnat;
			old_rnat = 0;
		}
		nat = (new_rnat >> ia64_rse_slot_num(new_bspstore)) & 1UL;
		old_rnat &= ~(1UL << ia64_rse_slot_num(old_bspstore));
		old_rnat |= (nat << ia64_rse_slot_num(old_bspstore));
		*old_bspstore++ = *new_bspstore++;
	}
	old_sw->ar_bspstore = (unsigned long)old_bspstore;
	old_sw->ar_rnat = old_rnat;

	sos->prev_task = previous_current;
	return previous_current;

no_mod:
	printk(KERN_INFO "cpu %d, %s %s, original stack not modified\n",
			smp_processor_id(), type, msg);
	return previous_current;
}

/* The monarch/slave interaction is based on monarch_cpu and requires that all
 * slaves have entered rendezvous before the monarch leaves.  If any cpu has
 * not entered rendezvous yet then wait a bit.  The assumption is that any
 * slave that has not rendezvoused after a reasonable time is never going to do
 * so.  In this context, slave includes cpus that respond to the MCA rendezvous
 * interrupt, as well as cpus that receive the INIT slave event.
 */

static void
ia64_wait_for_slaves(int monarch)
{
	int c, wait = 0;

	for_each_online_cpu(c) {
		if (c == monarch)
			continue;
		if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) {
			udelay(1000);		/* short wait first */
			wait = 1;
			break;
		}
	}
	if (!wait)
		return;
	for_each_online_cpu(c) {
		if (c == monarch)
			continue;
		if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) {
			udelay(5*1000000);	/* wait 5 seconds for slaves (arbitrary) */
			break;
		}
	}
}
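/*
 * Editor's illustration (not part of mca.c): the two-phase wait above,
 * reduced to portable C.  The monarch polls per-slave checkin flags:
 * one quick pass with a millisecond delay, then a single long,
 * arbitrary wait before giving up on stragglers.  NSLAVES and checkin[]
 * are invented for this sketch; the kernel tracks the same state in
 * ia64_mc_info.imi_rendez_checkin[].
 */
#include <stdatomic.h>
#include <unistd.h>

#define NSLAVES 4
static atomic_int checkin[NSLAVES];	/* 0 = NOTDONE, 1 = rendezvoused */

static void
wait_for_slaves_sketch(int monarch)
{
	int c, wait = 0;

	for (c = 0; c < NSLAVES; c++) {
		if (c == monarch)
			continue;
		if (!atomic_load(&checkin[c])) {
			usleep(1000);	/* short wait first */
			wait = 1;
			break;
		}
	}
	if (!wait)
		return;
	for (c = 0; c < NSLAVES; c++) {
		if (c == monarch)
			continue;
		if (!atomic_load(&checkin[c])) {
			sleep(5);	/* 5 seconds for stragglers (arbitrary) */
			break;
		}
	}
}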
/*
 * ia64_mca_handler
 *
 *	This is the uncorrectable machine check handler called from the
 *	OS_MCA dispatch code, which is in turn called from SAL_CHECK().
 *	This is the place where the core of OS MCA handling is done.
 *	Right now the logs are extracted and displayed in a well-defined
 *	format.  This handler code is supposed to be run only on the
 *	monarch processor.  Once the monarch is done with MCA handling,
 *	further MCA logging is enabled by clearing logs.
 *	The monarch also has the duty of sending wakeup-IPIs to pull the
 *	slave processors out of the rendezvous spinloop.
 */
void
ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
		 struct ia64_sal_os_state *sos)
{
	pal_processor_state_info_t *psp = (pal_processor_state_info_t *)
		&sos->proc_state_param;
	int recover, cpu = smp_processor_id();
	task_t *previous_current;

	oops_in_progress = 1;	/* FIXME: make printk NMI/MCA/INIT safe */
	previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
	monarch_cpu = cpu;
	if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0)
			== NOTIFY_STOP)
		ia64_mca_spin(__FUNCTION__);
	ia64_wait_for_slaves(cpu);

	/* Wakeup all the processors which are spinning in the rendezvous loop.
	 * They will leave SAL, then spin in the OS with interrupts disabled
	 * until this monarch cpu leaves the MCA handler.  That gets control
	 * back to the OS so we can backtrace the other cpus; backtracing
	 * while they spin in SAL does not work.
	 */
	ia64_mca_wakeup_all();
	if (notify_die(DIE_MCA_MONARCH_PROCESS, "MCA", regs, 0, 0, 0)
			== NOTIFY_STOP)
		ia64_mca_spin(__FUNCTION__);

	/* Get the MCA error record and log it */
	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);

	/* Recover if a TLB check is the only error in this SAL record */
	recover = (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc))
		/* other error recovery */
		|| (ia64_mca_ucmc_extension
			&& ia64_mca_ucmc_extension(
				IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA),
				sos));

	if (recover) {
		sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA);
		rh->severity = sal_log_severity_corrected;
		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
		sos->os_status = IA64_MCA_CORRECTED;
	}
	if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, 0, 0, recover)
			== NOTIFY_STOP)
		ia64_mca_spin(__FUNCTION__);

	set_curr_task(cpu, previous_current);
	monarch_cpu = -1;
}

static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd, NULL);
static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd, NULL);

/*
 * ia64_mca_cmc_int_handler
 *
 *	This is the corrected machine check interrupt handler.
 *	Right now the logs are extracted and displayed in a well-defined
 *	format.
 *
 * Inputs
 *	interrupt number
 *	client data arg ptr
 *	saved registers ptr
 *
 * Outputs
 *	None
 */
static irqreturn_t
ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs)
{
	static unsigned long	cmc_history[CMC_HISTORY_LENGTH];
	static int		index;
	static DEFINE_SPINLOCK(cmc_history_lock);

	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
		       __FUNCTION__, cmc_irq, smp_processor_id());

	/* SAL spec states this should run w/ interrupts enabled */
	local_irq_enable();

	/* Get the CMC error record and log it */
	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC);

	spin_lock(&cmc_history_lock);
	if (!cmc_polling_enabled) {
		int i, count = 1; /* we know 1 happened now */
		unsigned long now = jiffies;

		for (i = 0; i < CMC_HISTORY_LENGTH; i++) {
			if (now - cmc_history[i] <= HZ)
				count++;
		}

		IA64_MCA_DEBUG(KERN_INFO "CMC threshold %d/%d\n", count, CMC_HISTORY_LENGTH);
		if (count >= CMC_HISTORY_LENGTH) {

			cmc_polling_enabled = 1;
			spin_unlock(&cmc_history_lock);
			/* If we're being hit with CMC interrupts, we won't
			 * ever execute the schedule_work() below.  Need to
			 * disable CMC interrupts on this processor now.
			 */
			ia64_mca_cmc_vector_disable(NULL);
			schedule_work(&cmc_disable_work);

			/*
			 * Corrected errors will still be corrected, but
			 * make sure there's a log somewhere that indicates
			 * something is generating more than we can handle.
			 */
			printk(KERN_WARNING "WARNING: Switching to polling CMC handler; error records may be lost\n");

			mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);

			/* lock already released, get out now */
			return IRQ_HANDLED;
		} else {
			cmc_history[index++] = now;
			if (index == CMC_HISTORY_LENGTH)
				index = 0;
		}
	}
	spin_unlock(&cmc_history_lock);
	return IRQ_HANDLED;
}
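/*
 * Editor's illustration (not part of mca.c): the storm detection in
 * ia64_mca_cmc_int_handler above, as a pure function.  When HISTORY_LEN
 * events fall within one WINDOW of the newest one, the caller should
 * switch from interrupt mode to polling.  HISTORY_LEN and WINDOW are
 * invented stand-ins for CMC_HISTORY_LENGTH and HZ.
 */
#define HISTORY_LEN	8
#define WINDOW		1000	/* same units as the timestamps passed in */

static unsigned long event_history[HISTORY_LEN];
static int event_index;

static int
cmc_storm_detected(unsigned long now)
{
	int i, count = 1;	/* count the event happening now */

	for (i = 0; i < HISTORY_LEN; i++)
		if (now - event_history[i] <= WINDOW)
			count++;
	if (count >= HISTORY_LEN)
		return 1;	/* storm: caller switches to polling */
	event_history[event_index++] = now;
	if (event_index == HISTORY_LEN)
		event_index = 0;
	return 0;
}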
/*
 * ia64_mca_cmc_int_caller
 *
 *	Triggered by sw interrupt from CMC polling routine.  Calls
 *	real interrupt handler and either triggers a sw interrupt
 *	on the next cpu or does cleanup at the end.
 *
 * Inputs
 *	interrupt number
 *	client data arg ptr
 *	saved registers ptr
 * Outputs
 *	handled
 */
static irqreturn_t
ia64_mca_cmc_int_caller(int cmc_irq, void *arg, struct pt_regs *ptregs)
{
	static int start_count = -1;
	unsigned int cpuid;

	cpuid = smp_processor_id();

	/* If first cpu, update count */
	if (start_count == -1)
		start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC);

	ia64_mca_cmc_int_handler(cmc_irq, arg, ptregs);

	for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);

	if (cpuid < NR_CPUS) {
		platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
	} else {
		/* If no log record, switch out of polling mode */
		if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) {

			printk(KERN_WARNING "Returning to interrupt driven CMC handler\n");
			schedule_work(&cmc_enable_work);
			cmc_polling_enabled = 0;

		} else {

			mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
		}

		start_count = -1;
	}

	return IRQ_HANDLED;
}

/*
 * ia64_mca_cmc_poll
 *
 *	Poll for Corrected Machine Checks (CMCs)
 *
 * Inputs   : dummy(unused)
 * Outputs  : None
 *
 */
static void
ia64_mca_cmc_poll (unsigned long dummy)
{
	/* Trigger a CMC interrupt cascade */
	platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
}

/*
 * ia64_mca_cpe_int_caller
 *
 *	Triggered by sw interrupt from CPE polling routine.  Calls
 *	real interrupt handler and either triggers a sw interrupt
 *	on the next cpu or does cleanup at the end.
 *
 * Inputs
 *	interrupt number
 *	client data arg ptr
 *	saved registers ptr
 * Outputs
 *	handled
 */
#ifdef CONFIG_ACPI

static irqreturn_t
ia64_mca_cpe_int_caller(int cpe_irq, void *arg, struct pt_regs *ptregs)
{
	static int start_count = -1;
	static int poll_time = MIN_CPE_POLL_INTERVAL;
	unsigned int cpuid;

	cpuid = smp_processor_id();

	/* If first cpu, update count */
	if (start_count == -1)
		start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE);

	ia64_mca_cpe_int_handler(cpe_irq, arg, ptregs);

	for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);

	if (cpuid < NR_CPUS) {
		platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
	} else {
		/*
		 * If a log was recorded, increase our polling frequency,
		 * otherwise, backoff or return to interrupt mode.
		 */
		if (start_count != IA64_LOG_COUNT(SAL_INFO_TYPE_CPE)) {
			poll_time = max(MIN_CPE_POLL_INTERVAL, poll_time / 2);
		} else if (cpe_vector < 0) {
			poll_time = min(MAX_CPE_POLL_INTERVAL, poll_time * 2);
		} else {
			poll_time = MIN_CPE_POLL_INTERVAL;

			printk(KERN_WARNING "Returning to interrupt driven CPE handler\n");
			enable_irq(local_vector_to_irq(IA64_CPE_VECTOR));
			cpe_poll_enabled = 0;
		}

		if (cpe_poll_enabled)
			mod_timer(&cpe_poll_timer, jiffies + poll_time);
		start_count = -1;
	}

	return IRQ_HANDLED;
}
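/*
 * Editor's illustration (not part of mca.c): the adaptive interval from
 * ia64_mca_cpe_int_caller above, as a pure function.  Fresh log records
 * halve the interval (poll faster); quiet polls double it; both clamped.
 * The switch back to interrupt mode when cpe_vector is valid is left
 * out.  The min/max arguments stand in for MIN_CPE_POLL_INTERVAL and
 * MAX_CPE_POLL_INTERVAL.
 */
static unsigned long
next_poll_interval(unsigned long cur, int new_records,
		   unsigned long min, unsigned long max)
{
	if (new_records)
		cur /= 2;	/* errors arriving: poll more often */
	else
		cur *= 2;	/* quiet: back off */
	if (cur < min)
		cur = min;
	if (cur > max)
		cur = max;
	return cur;
}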
/*
 * ia64_mca_cpe_poll
 *
 *	Poll for Corrected Platform Errors (CPEs), trigger interrupt
 *	on first cpu, from there it will trickle through all the cpus.
 *
 * Inputs   : dummy(unused)
 * Outputs  : None
 *
 */
static void
ia64_mca_cpe_poll (unsigned long dummy)
{
	/* Trigger a CPE interrupt cascade */
	platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
}

#endif /* CONFIG_ACPI */

static int
default_monarch_init_process(struct notifier_block *self, unsigned long val, void *data)
{
	int c;
	struct task_struct *g, *t;
	if (val != DIE_INIT_MONARCH_PROCESS)
		return NOTIFY_DONE;
	printk(KERN_ERR "Processes interrupted by INIT -");
	for_each_online_cpu(c) {
		struct ia64_sal_os_state *s;
		t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET);
		s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET);
		g = s->prev_task;
		if (g) {
			if (g->pid)
				printk(" %d", g->pid);
			else
				printk(" %d (cpu %d task 0x%p)", g->pid,
				       task_cpu(g), g);
		}
	}
	printk("\n\n");
	if (read_trylock(&tasklist_lock)) {
		do_each_thread (g, t) {
			printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
			show_stack(t, NULL);
		} while_each_thread (g, t);
		read_unlock(&tasklist_lock);
	}
	return NOTIFY_DONE;
}

/*
 * C portion of the OS INIT handler
 *
 * Called from ia64_os_init_dispatch
 *
 * Inputs: pointer to pt_regs where processor info was saved.  SAL/OS state for
 * this event.  This code is used for both monarch and slave INIT events, see
 * sos->monarch.
 *
 * All INIT events switch to the INIT stack and change the previous process to
 * blocked status.  If one of the INIT events is the monarch then we are
 * probably processing the nmi button/command.  Use the monarch cpu to dump all
 * the processes.  The slave INIT events all spin until the monarch cpu
 * returns.  We can also get INIT slave events for MCA, in which case the MCA
 * process is the monarch.
 */
void
ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
		  struct ia64_sal_os_state *sos)
{
	static atomic_t slaves;
	static atomic_t monarchs;
	task_t *previous_current;
	int cpu = smp_processor_id();

	oops_in_progress = 1;	/* FIXME: make printk NMI/MCA/INIT safe */
	console_loglevel = 15;	/* make sure printks make it to console */

	printk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
		sos->proc_state_param, cpu, sos->monarch);
	salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0);

	previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "INIT");
	sos->os_status = IA64_INIT_RESUME;

	/* FIXME: Workaround for broken proms that drive all INIT events as
	 * slaves.  The last slave that enters is promoted to be a monarch.
	 * Remove this code in September 2006; that gives platforms a year to
	 * fix their proms and get their customers updated.
	 */
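/*
 * Editor's illustration (not part of mca.c): the monarch/slave INIT
 * protocol described above, in miniature with C11 atomics.  Slaves spin
 * until a monarch announces itself and again until it finishes; the
 * monarch does the real work (waiting for slaves, dumping processes).
 * All names here are invented for the sketch; the kernel version uses
 * its own barriers and per-cpu state.
 */
#include <stdatomic.h>

static atomic_int monarch_cpu_sketch = -1;	/* -1: no monarch active */

static void
init_slave_sketch(void)
{
	while (atomic_load(&monarch_cpu_sketch) == -1)
		;	/* wait for a monarch to show up */
	while (atomic_load(&monarch_cpu_sketch) != -1)
		;	/* spin until the monarch is done */
}

static void
init_monarch_sketch(int cpu)
{
	atomic_store(&monarch_cpu_sketch, cpu);
	/* ... wait for slaves to check in, dump process state ... */
	atomic_store(&monarch_cpu_sketch, -1);	/* release the slaves */
}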