
fault.c (linux-2.6.15.6)
/*
 *  linux/arch/i386/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/kdebug.h>

extern void die(const char *,struct pt_regs *,long);

/*
 * Unlock any spinlocks which will prevent us from getting the
 * message out
 */
void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;

	if (yes) {
		oops_in_progress = 1;
		return;
	}
#ifdef CONFIG_VT
	unblank_screen();
#endif
	oops_in_progress = 0;
	/*
	 * OK, the message is on the console.  Now we call printk()
	 * without oops_in_progress set so that printk will give klogd
	 * a poke.  Hold onto your hats...
	 */
	console_loglevel = 15;		/* NMI oopser may have shut the console up */
	printk(" ");
	console_loglevel = loglevel_save;
}

/*
 * Return EIP plus the CS segment base.  The segment limit is also
 * adjusted, clamped to the kernel/user address space (whichever is
 * appropriate), and returned in *eip_limit.
 *
 * The segment is checked, because it might have been changed by another
 * task between the original faulting instruction and here.
 *
 * If CS is no longer a valid code segment, or if EIP is beyond the
 * limit, or if it is a kernel address when CS is not a kernel segment,
 * then the returned value will be greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long eip = regs->eip;
	unsigned seg = regs->xcs & 0xffff;
	u32 seg_ar, seg_limit, base, *desc;

	/* The standard kernel/user address space limit. */
	*eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;

	/* Unlikely, but must come before segment checks. */
	if (unlikely((regs->eflags & VM_MASK) != 0))
		return eip + (seg << 4);

	/* By far the most common cases. */
	if (likely(seg == __USER_CS || seg == __KERNEL_CS))
		return eip;

	/* Check the segment exists, is within the current LDT/GDT size,
	   that kernel/user (ring 0..3) has the appropriate privilege,
	   that it's a code segment, and get the limit. */
	__asm__ ("larl %3,%0; lsll %3,%1"
		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
	if ((~seg_ar & 0x9800) || eip > seg_limit) {
		*eip_limit = 0;
		return 1;	 /* So that returned eip > *eip_limit. */
	}

	/* Get the GDT/LDT descriptor base.
	   When you look for races in this code remember that
	   LDT and other horrors are only used in user space. */
	if (seg & (1<<2)) {
		/* Must lock the LDT while reading it. */
		down(&current->mm->context.sem);
		desc = current->mm->context.ldt;
		desc = (void *)desc + (seg & ~7);
	} else {
		/* Must disable preemption while reading the GDT. */
		desc = (u32 *)get_cpu_gdt_table(get_cpu());
		desc = (void *)desc + (seg & ~7);
	}

	/* Decode the code segment base from the descriptor */
	base = get_desc_base((unsigned long *)desc);
	if (seg & (1<<2)) {
		up(&current->mm->context.sem);
	} else
		put_cpu();

	/* Adjust EIP and segment limit, and clamp at the kernel limit.
	   It's legitimate for segments to wrap at 0xffffffff. */
	seg_limit += base;
	if (seg_limit < *eip_limit && seg_limit >= base)
		*eip_limit = seg_limit;
	return eip + base;
}

/*
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 */
static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
{
	unsigned long limit;
	unsigned long instr = get_segment_eip (regs, &limit);
	int scan_more = 1;
	int prefetch = 0;
	int i;

	for (i = 0; scan_more && i < 15; i++) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (instr > limit)
			break;
		if (__get_user(opcode, (unsigned char __user *) instr))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;

		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;

		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (instr > limit)
				break;
			if (__get_user(opcode, (unsigned char __user *) instr))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;

		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
			      unsigned long error_code)
{
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & 16))
			return 0;
		return __is_prefetch(regs, addr);
	}
	return 0;
}

static noinline void force_sig_info_fault(int si_signo, int si_code,
	unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

fastcall void do_invalid_op(struct pt_regs *, unsigned long);

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 */
fastcall void __kprobes do_page_fault(struct pt_regs *regs,
				      unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	int write, si_code;

	/* get the address */
	address = read_cr2();

	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
					SIGSEGV) == NOTIFY_STOP)
		return;
	/* It's safe to allow irq's after cr2 has been saved */
	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
		local_irq_enable();

	tsk = current;

	si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 1) == 0.
	 */
	if (unlikely(address >= TASK_SIZE)) {
		if (!(error_code & 5))
			goto vmalloc_fault;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	mm = tsk->mm;

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault..
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;

	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & 4) == 0 &&
		    !search_exception_tables(regs->eip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {
		/*
		 * accessing the stack below %esp is always a bug.
		 * The "+ 32" is there due to some instructions (like
		 * pusha) doing post-decrement on the stack and that
		 * doesn't show up until later..
		 */
		if (address + 32 < regs->esp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

 survive:
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
		case VM_FAULT_MINOR:
			tsk->min_flt++;
			break;
		case VM_FAULT_MAJOR:
			tsk->maj_flt++;
			break;
		case VM_FAULT_SIGBUS:
			goto do_sigbus;
		case VM_FAULT_OOM:
			goto out_of_memory;
		default:
			BUG();
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & 4) {
		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

#ifdef CONFIG_X86_F00F_BUG
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		unsigned long nr;

		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return;
		}
	}
#endif

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	bust_spinlocks(1);

#ifdef CONFIG_X86_PAE
	if (error_code & 16) {
		pte_t *pte = lookup_address(address);

		if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
			printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
	}
#endif
	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at virtual address %08lx\n",address);
	printk(KERN_ALERT " printing eip:\n");
	printk("%08lx\n", regs->eip);
	page = read_cr3();
	page = ((unsigned long *) __va(page))[address >> 22];
	printk(KERN_ALERT "*pde = %08lx\n", page);
	/*
	 * We must not directly access the pte in the highpte
	 * case, the page table might be allocated in highmem.
	 * And lets rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
#ifndef CONFIG_HIGHPTE
	if (page & 1) {
		page &= PAGE_MASK;
		address &= 0x003ff000;
		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
		printk(KERN_ALERT "*pte = %08lx\n", page);
	}
#endif
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (tsk->pid == 1) {
		yield();
		down_read(&mm->mmap_sem);
		goto survive;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & 4))
		goto no_context;

	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;

vmalloc_fault:
	{
		/*
		 * Synchronize this task's top level page-table
		 * with the 'reference' page table.
		 *
		 * Do _not_ use "tsk" here. We might be inside
		 * an interrupt in the middle of a task switch..
		 */
		int index = pgd_index(address);
		unsigned long pgd_paddr;
		pgd_t *pgd, *pgd_k;
		pud_t *pud, *pud_k;
		pmd_t *pmd, *pmd_k;
		pte_t *pte_k;

		pgd_paddr = read_cr3();
		pgd = index + (pgd_t *)__va(pgd_paddr);
		pgd_k = init_mm.pgd + index;

		if (!pgd_present(*pgd_k))
			goto no_context;

		/*
		 * set_pgd(pgd, *pgd_k); here would be useless on PAE
		 * and redundant with the set_pmd() on non-PAE. As would
		 * set_pud.
		 */

		pud = pud_offset(pgd, address);
		pud_k = pud_offset(pgd_k, address);
		if (!pud_present(*pud_k))
			goto no_context;

		pmd = pmd_offset(pud, address);
		pmd_k = pmd_offset(pud_k, address);
		if (!pmd_present(*pmd_k))
			goto no_context;
		set_pmd(pmd, *pmd_k);

		pte_k = pte_offset_kernel(pmd_k, address);
		if (!pte_present(*pte_k))
			goto no_context;
		return;
	}
}
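
The error_code bit layout documented in the do_page_fault() header comment drives every branch in the handler. A minimal stand-alone sketch of those bit tests, with decode_fault() as a hypothetical helper name (not kernel code):

/*
 * User-space sketch of the error_code decoding used by do_page_fault()
 * above.  decode_fault() is a hypothetical helper, not kernel code; it
 * only mirrors the (error_code & 1/2/4) tests from the handler.
 */
#include <stdio.h>

static void decode_fault(unsigned long error_code)
{
	/* bit 0: 0 = no page found, 1 = protection fault */
	printf("%s, ", (error_code & 1) ? "protection fault" : "no page found");
	/* bit 1: 0 = read access, 1 = write access */
	printf("%s, ", (error_code & 2) ? "write" : "read");
	/* bit 2: 0 = kernel mode, 1 = user mode */
	printf("%s\n", (error_code & 4) ? "user mode" : "kernel mode");
}

int main(void)
{
	decode_fault(6);	/* write to an unmapped page from user mode */
	decode_fault(0);	/* kernel read of an unmapped page: the
				   !(error_code & 5) vmalloc_fault case */
	return 0;
}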
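The opcode walk in __is_prefetch() can also be exercised outside the kernel. The sketch below replays the same prefix scan over a plain byte buffer instead of reading through __get_user(); is_prefetch_insn() is a hypothetical name, and the buffer/length interface is an assumption made for the sake of a self-contained example:

/*
 * User-space sketch of the scan in __is_prefetch() above: skip legal
 * x86 prefix bytes, then check for the two-byte prefetch opcodes
 * 0x0F 0x0D and 0x0F 0x18.
 */
#include <stdio.h>
#include <stddef.h>

static int is_prefetch_insn(const unsigned char *instr, size_t len)
{
	int scan_more = 1;
	int prefetch = 0;
	size_t i;

	for (i = 0; scan_more && i < 15 && i < len; i++) {
		unsigned char opcode = instr[i];
		unsigned char instr_hi = opcode & 0xf0;
		unsigned char instr_lo = opcode & 0x0f;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* 0x26,0x2E,0x36,0x3E: segment-override prefixes */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
		case 0x60:
			/* 0x64 thru 0x67: valid prefixes in all modes */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0 (lock), 0xF2/0xF3 (rep) prefixes */
			scan_more = !instr_lo || (instr_lo >> 1) == 1;
			break;
		case 0x00:
			/* possible 0x0F escape: prefetch is 0F 0D or 0F 18 */
			scan_more = 0;
			if (instr_lo == 0xF && i + 1 < len)
				prefetch = (instr[i + 1] == 0x0D ||
					    instr[i + 1] == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

int main(void)
{
	/* "prefetchnta (%eax)" encoded as 0F 18 00, behind an ES prefix */
	unsigned char insn[] = { 0x26, 0x0F, 0x18, 0x00 };

	printf("prefetch: %d\n", is_prefetch_insn(insn, sizeof insn));
	return 0;
}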
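Finally, the "*pde"/"*pte" dump in the oops path relies on the non-PAE i386 address split: 10 bits of page-directory index, 10 bits of page-table index, 12 bits of page offset. A small sketch of that split, assuming PAGE_SHIFT = 12 and using an arbitrary example address:

/*
 * User-space sketch of the non-PAE i386 virtual address split used by
 * the oops dump above (address >> 22 and the 0x003ff000 mask).
 */
#include <stdio.h>

int main(void)
{
	unsigned long address = 0xC01A3F04UL;	/* arbitrary kernel address */

	/* matches "address >> 22" in the *pde lookup */
	printf("pde index: %lu\n", address >> 22);
	/* matches "address &= 0x003ff000" then ">> PAGE_SHIFT" */
	printf("pte index: %lu\n", (address & 0x003ff000UL) >> 12);
	printf("offset   : %lu\n", address & 0xfffUL);
	return 0;
}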
