📄 memory.c
字号:
/* * linux/mm/memory.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds *//* * demand-loading started 01.12.91 - seems it is high on the list of * things wanted, and it should be easy to implement. - Linus *//* * Ok, demand-loading was easy, shared pages a little bit tricker. Shared * pages started 02.12.91, seems to work. - Linus. * * Tested sharing by executing about 30 /bin/sh: under the old kernel it * would have taken more than the 6M I have free, but it worked well as * far as I could see. * * Also corrected some "invalidate()"s - I wasn't doing enough of them. *//* * Real VM (paging to/from disk) started 18.12.91. Much more work and * thought has to go into this. Oh, well.. * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. * Found it. Everything seems to work now. * 20.12.91 - Ok, making the swap-device changeable like the root. *//* * 05.04.94 - Multi-page memory management added for v1.1. * Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) * * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */#include <linux/kernel_stat.h>#include <linux/mm.h>#include <linux/hugetlb.h>#include <linux/mman.h>#include <linux/swap.h>#include <linux/highmem.h>#include <linux/pagemap.h>#include <linux/rmap.h>#include <linux/module.h>#include <linux/delayacct.h>#include <linux/init.h>#include <linux/writeback.h>#include <linux/memcontrol.h>#include <linux/mmu_notifier.h>#include <linux/kallsyms.h>#include <linux/swapops.h>#include <linux/elf.h>#include <asm/pgalloc.h>#include <asm/uaccess.h>#include <asm/tlb.h>#include <asm/tlbflush.h>#include <asm/pgtable.h>#include "internal.h"#ifndef CONFIG_NEED_MULTIPLE_NODES/* use the per-pgdat data instead for discontigmem - mbligh */unsigned long max_mapnr;struct page *mem_map;EXPORT_SYMBOL(max_mapnr);EXPORT_SYMBOL(mem_map);#endifunsigned long num_physpages;/* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL * and ZONE_HIGHMEM. */void * high_memory;EXPORT_SYMBOL(num_physpages);EXPORT_SYMBOL(high_memory);/* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, * as ancient (libc5 based) binaries can segfault. ) */int randomize_va_space __read_mostly =#ifdef CONFIG_COMPAT_BRK 1;#else 2;#endifstatic int __init disable_randmaps(char *s){ randomize_va_space = 0; return 1;}__setup("norandmaps", disable_randmaps);/* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but * very seldom) called out from the p?d_none_or_clear_bad macros. */void pgd_clear_bad(pgd_t *pgd){ pgd_ERROR(*pgd); pgd_clear(pgd);}void pud_clear_bad(pud_t *pud){ pud_ERROR(*pud); pud_clear(pud);}void pmd_clear_bad(pmd_t *pmd){ pmd_ERROR(*pmd); pmd_clear(pmd);}/* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd){ pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token); tlb->mm->nr_ptes--;}static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling){ pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd);}static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling){ pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud);}/* * This function frees user-level page tables of a process. * * Must be called with pagetable lock held. */void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling){ pgd_t *pgd; unsigned long next; unsigned long start; /* * The next few lines have given us lots of grief... * * Why are we testing PMD* at this top level? Because often * there will be no work to do at all, and we'd prefer not to * go all the way down to the bottom just to discover that. * * Why all these "- 1"s? Because 0 represents both the bottom * of the address space and the top of it (using -1 for the * top wouldn't help much: the masks would do the wrong thing). * The rule is that addr 0 and floor 0 refer to the bottom of * the address space, but end 0 and ceiling 0 refer to the top * Comparisons need to use "end - 1" and "ceiling - 1" (though * that end 0 case should be mythical). * * Wherever addr is brought up or ceiling brought down, we must * be careful to reject "the opposite 0" before it confuses the * subsequent tests. But what about where end is brought down * by PMD_SIZE below? no, end can't go down to 0 there. * * Whereas we round start (addr) and ceiling down, by different * masks at different levels, in order to test whether a table * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. */ addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; start = addr; pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end);}void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling){ while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; /* * Hide vma from rmap and vmtruncate before freeing pgtables */ anon_vma_unlink(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; anon_vma_unlink(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } vma = next; }}int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address){ pgtable_t new = pte_alloc_one(mm, address); if (!new) return -ENOMEM; /* * Ensure all pte setup (eg. pte page lock and page clearing) are * visible before the pte is made visible to other CPUs by being * put into page tables. * * The other side of the story is the pointer chasing in the page * table walking code (when walking the page table without locking; * ie. most of the time). Fortunately, these data accesses consist * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the * smp_read_barrier_depends() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ spin_lock(&mm->page_table_lock); if (!pmd_present(*pmd)) { /* Has another populated it ? */ mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; } spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); return 0;}int __pte_alloc_kernel(pmd_t *pmd, unsigned long address){ pte_t *new = pte_alloc_one_kernel(&init_mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); if (!pmd_present(*pmd)) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0;}static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss){ if (file_rss) add_mm_counter(mm, file_rss, file_rss); if (anon_rss) add_mm_counter(mm, anon_rss, anon_rss);}/* * This function is called to print an error when a bad pte * is found. For example, we might have a PFN-mapped pte in * a region that doesn't allow it. * * The calling function must still handle the error. */static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page){ pgd_t *pgd = pgd_offset(vma->vm_mm, addr); pud_t *pud = pud_offset(pgd, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. */ if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return; } if (nr_unshown) { printk(KERN_ALERT "BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); printk(KERN_ALERT "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) { printk(KERN_ALERT "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", page, (void *)page->flags, page_count(page), page_mapcount(page), page->mapping, page->index); } printk(KERN_ALERT "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ if (vma->vm_ops) print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", (unsigned long)vma->vm_ops->fault); if (vma->vm_file && vma->vm_file->f_op) print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", (unsigned long)vma->vm_file->f_op->mmap); dump_stack(); add_taint(TAINT_BAD_PAGE);}static inline int is_cow_mapping(unsigned int flags){ return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;}/* * vm_normal_page -- This function gets the "struct page" associated with a pte. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this * case, NULL is returned here. "Normal" mappings do have a struct page. * * There are 2 broad cases. Firstly, an architecture may define a pte_special() * pte bit, in which case this function is trivial. Secondly, an architecture * may not have a spare pte bit, which requires a more complicated scheme, * described below. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. * * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit * set, and the vm_pgoff will point to the first PFN mapped: thus every special * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -