📄 mce_64.c
字号:
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6

/* Incremented on entry to do_machine_check() and decremented on exit. */
atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
/* Bit 0 is set by mce_log() and consumed by mce_notify_user(). */
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = 1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

/*
 * Append a copy of @mce to the global mcelog buffer without taking a lock.
 *
 * A slot is claimed by advancing mcelog.next with cmpxchg(); if another
 * writer wins the race the search is restarted from the new mcelog.next.
 * When no free slot is left, MCE_OVERFLOW is set in mcelog.flags and the
 * record is dropped. On success the slot is marked finished and bit 0 of
 * notify_user is set so that mce_notify_user() will report it.
 *
 * Note that @mce->finished is cleared in the caller's record.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting.
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Publish the slot only after its contents have been written. */
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}

/*
 * Print one machine check record at KERN_EMERG level. The RIP line is
 * emitted only when m->rip is non-zero and is tagged "!INEXACT!" when
 * MCG_STATUS_EIPV is clear; ADDR and MISC are printed only when non-zero.
 */
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

/*
 * Dump the relevant machine check records and panic.
 *
 * @msg:    panic message
 * @backup: record to print if it is not found in mcelog (may be NULL)
 * @start:  TSC value; logged records with an earlier TSC are skipped
 *
 * @backup is considered "found" when a printed mcelog entry carries the
 * same TSC value.
 */
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	/* Never hand a variable string to panic() as the format itself. */
	panic("%s", msg);
}

/* Return non-zero when @c advertises both the MCE and the MCA feature. */
static int mce_available(struct cpuinfo_x86 *c)
{
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

/*
 * Fill in m->rip and m->cs.
 *
 * The values come from @regs when it is non-NULL and MCG_STATUS_RIPV is
 * set in m->mcgstatus; otherwise both are zeroed. If rip_msr is set it
 * overrides the above: the RIP is read from that MSR, cs is cleared and
 * MCG_STATUS_EIPV is forced on in m->mcgstatus.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}

/*
 * The actual machine check handler
 *
 * @regs:       register state at the time of the exception, or NULL when
 *              called from the polling path (mcheck_check_cpu()); with
 *              NULL nothing final (panic/kill) is ever done.
 * @error_code: a negative value suppresses the per-record TSC read, and
 *              -2 additionally suppresses logging of the records.
 *
 * Scans every enabled bank, logs valid records, decides whether the
 * machine can continue, and finally clears all bank status MSRs and
 * MCG_STATUS.
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;

	atomic_inc(&mce_entry);

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL);
	if (!banks)
		goto out2;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	/* Reference TSC: mce_panic() skips log entries older than this. */
	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/*
		 * Assume that the bank with uncorrectable errors did it,
		 * and that there is only a single one.
		 */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/*
	 * If we didn't find an uncorrectable error, pick
	 * the last one (shouldn't happen, just being safe).
	 */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 *
		 * Test the bit in the same record whose rip/cs is examined.
		 */
		if (panicm.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * do_exit() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			do_exit(SIGBUS);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

 out:
	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
	struct mce m;

	memset(&m, 0, sizeof(m));
	m.cpu = cpu;
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	rdtscll(m.tsc);
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);

/*
 * Per-CPU poll callback: run the handler in polling mode (NULL regs) if
 * the current CPU supports machine checks.
 */
static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

/*
 * Delayed-work handler: poll every CPU, adapt the polling interval and
 * re-arm the work item.
 */
static void mcheck_timer(struct work_struct *work)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	if (mce_notify_user()) {
		next_interval = max(next_interval/2, HZ/100);
	} else {
		next_interval = min(next_interval * 2,
				(int)round_jiffies_relative(check_interval*HZ));
	}

	schedule_delayed_work(&mcheck_work, next_interval);
}

/*
 * This is only called from process context.  This is where we do
 * anything we need to alert userspace about new MCEs.  This is called
 * directly from the poller and also from entry.S and idle, thanks to
 * TIF_MCE_NOTIFY.
 *
 * Returns 1 if new events were pending (bit 0 of notify_user was set),
 * 0 otherwise.
 */
int mce_notify_user(void)
{
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		static unsigned long last_print;
		unsigned long now = jiffies;

		wake_up_interruptible(&mce_wait);
		if (trigger[0])
			call_usermodehelper(trigger, trigger_argv, NULL,
						UMH_NO_WAIT);

		/* Print the notice at most once per check_interval seconds. */
		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}

		return 1;
	}
	return 0;
}

/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

/*
 * Start the periodic poller (unless check_interval is 0) and register
 * the idle notifier. Always returns 0.
 */
static __init int periodic_mcheck_init(void)
{
	next_interval = check_interval * HZ;
	if (next_interval)
		schedule_delayed_work(&mcheck_work,
				      round_jiffies_relative(next_interval));
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -