mempolicy.c
	err = security_task_movememory(task);
	if (err)
		goto out;

	err = do_migrate_pages(mm, &old, &new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}

/* Retrieve NUMA policy */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);
	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif
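/*
 * A minimal user-space sketch, not part of this file, showing how the
 * set_mempolicy()/get_mempolicy() system calls defined above are typically
 * driven.  It assumes the libnuma <numaif.h> wrappers (build with -lnuma);
 * the node numbers are made up for the example.  It requests interleaving
 * over nodes 0 and 1, then reads the task policy back.  Note the
 * "maxnode < MAX_NUMNODES" check in sys_get_mempolicy(): the output bitmap
 * must be at least MAX_NUMNODES bits, so a generously sized buffer is used.
 *
 *	#include <numaif.h>	// set_mempolicy(), get_mempolicy(), MPOL_*
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long request = 0x3;		// nodes 0 and 1
 *		unsigned long mask[64] = { 0 };		// 4096 node bits
 *		int mode = -1;
 *
 *		// maxnode counts one past the highest bit of interest
 *		if (set_mempolicy(MPOL_INTERLEAVE, &request,
 *				  sizeof(request) * 8 + 1))
 *			perror("set_mempolicy");
 *
 *		// no address and flags == 0: query the task policy
 *		if (get_mempolicy(&mode, mask, sizeof(mask) * 8, NULL, 0))
 *			perror("get_mempolicy");
 *
 *		printf("mode=%d, first nodemask word=0x%lx\n", mode, mask[0]);
 *		return 0;
 *	}
 */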
/*
 * get_vma_policy(@task, @vma, @addr)
 * @task - task for fallback if vma policy == default
 * @vma - virtual memory area whose policy is sought
 * @addr - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Current or other task's task mempolicy and non-shared vma policies
 * are protected by the task's mmap_sem, which must be held for read by
 * the caller.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
static struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
									addr);
			if (vpol)
				pol = vpol;
		} else if (vma->vm_policy)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->mode == MPOL_BIND) &&
			gfp_zone(gfp) >= policy_zone &&
			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	return NULL;
}

/* Return a zonelist indicated by gfp for node representing a mempolicy */
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
{
	int nd = numa_node_id();

	switch (policy->mode) {
	case MPOL_PREFERRED:
		if (!(policy->flags & MPOL_F_LOCAL))
			nd = policy->v.preferred_node;
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
		 * current node is not part of the mask, we use the zonelist
		 * for the first node in the mask instead.
		 */
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	case MPOL_INTERLEAVE:	/* should not happen */
		break;
	default:
		BUG();
	}
	return node_zonelist(nd, gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	if (next < MAX_NUMNODES)
		me->il_next = next;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected from freeing by the caller.  If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task can change its policy.  The system default policy requires no
 * such protection.
 */
unsigned slab_node(struct mempolicy *policy)
{
	if (!policy || policy->flags & MPOL_F_LOCAL)
		return numa_node_id();

	switch (policy->mode) {
	case MPOL_PREFERRED:
		/*
		 * handled MPOL_F_LOCAL above
		 */
		return policy->v.preferred_node;

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes,
							&zone);
		return zone->node;
	}

	default:
		BUG();
	}
}
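/*
 * A self-contained sketch, not kernel code, of the round-robin walk that
 * interleave_nodes() performs above.  The allowed node set is modeled as a
 * plain 64-bit mask and il_next as a local cursor; next_node()/first_node()
 * are approximated with a simple bit scan.  Node numbers are made up.
 */
#include <stdio.h>

/* find the next set bit strictly after 'nid', or 64 if none remains */
static int model_next_node(int nid, unsigned long long mask)
{
	for (int n = nid + 1; n < 64; n++)
		if (mask & (1ULL << n))
			return n;
	return 64;
}

static int model_first_node(unsigned long long mask)
{
	return model_next_node(-1, mask);
}

int main(void)
{
	unsigned long long nodes = 0x0b;	/* allowed nodes: 0, 1, 3 */
	int il_next = model_first_node(nodes);

	/* eight successive "allocations" cycle 0, 1, 3, 0, 1, 3, ... */
	for (int i = 0; i < 8; i++) {
		int nid = il_next;
		int next = model_next_node(nid, nodes);

		if (next >= 64)		/* wrap, as interleave_nodes() does */
			next = model_first_node(nodes);
		il_next = next;
		printf("allocation %d -> node %d\n", i, nid);
	}
	return 0;
}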
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = -1;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}

#ifdef CONFIG_HUGETLBFS

/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
 * @vma = virtual memory area whose policy is sought
 * @addr = address in @vma for shared policy lookup and interleave policy
 * @gfp_flags = for requested zone
 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is MPOL_BIND, returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist.
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
				gfp_t gfp_flags, struct mempolicy **mpol,
				nodemask_t **nodemask)
{
	struct zonelist *zl;

	*mpol = get_vma_policy(current, vma, addr);
	*nodemask = NULL;	/* assume !MPOL_BIND */

	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
				huge_page_shift(hstate_vma(vma))), gfp_flags);
	} else {
		zl = policy_zonelist(gfp_flags, *mpol);
		if ((*mpol)->mode == MPOL_BIND)
			*nodemask = &(*mpol)->v.nodes;
	}
	return zl;
}
#endif

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = node_zonelist(nid, gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
	return page;
}
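/*
 * A small user-space sketch, not kernel code, of the static interleave
 * choice made by offset_il_node()/interleave_nid() above: the VMA offset
 * (scaled by shift - PAGE_SHIFT for huge pages) is reduced modulo the
 * number of allowed nodes, and the allowed set is walked to that position.
 * The allowed node list and its values are made up for the example.
 */
#include <stdio.h>

static unsigned model_offset_il_node(const int *allowed, unsigned nnodes,
				     unsigned long off)
{
	/* kernel: target = off % nnodes, then next_node() target+1 times */
	return (unsigned)allowed[off % nnodes];
}

int main(void)
{
	const int allowed[] = { 0, 2, 5 };	/* hypothetical allowed nodes */
	const unsigned nnodes = 3;

	/* consecutive offsets spread across the allowed nodes in order */
	for (unsigned long off = 0; off < 7; off++)
		printf("offset %lu -> node %u\n", off,
		       model_offset_il_node(allowed, nnodes, off));
	return 0;
}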
/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation.
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away.  Should be used for
 * all allocations for pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	struct zonelist *zl;

	cpuset_update_task_memory_state();

	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		mpol_cond_put(pol);
		return alloc_page_interleave(gfp, 0, nid);
	}
	zl = policy_zonelist(gfp, pol);
	if (unlikely(mpol_needs_cond_ref(pol))) {
		/*
		 * slow path: ref counted shared policy
		 */
		struct page *page = __alloc_pages_nodemask(gfp, 0,
					zl, policy_nodemask(gfp, pol));
		__mpol_put(pol);
		return page;
	}
	/*
	 * fast path: default or task policy
	 */
	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system.
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool.  When not in
 * interrupt context, the current process' NUMA policy is applied.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_task_memory_state() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_task_memory_state();
	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages_nodemask(gfp, order,
			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(old, &mems);
	}
	*new = *old;
	atomic_set(&new->refcnt, 1);
	return new;
}
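/*
 * A minimal, hypothetical kernel-module sketch, not part of mempolicy.c,
 * showing how an ordinary alloc_pages() call reaches alloc_pages_current()
 * above on CONFIG_NUMA kernels, so the calling task's mempolicy (or
 * default_policy) decides the node.  The module name and message text are
 * invented for the example.
 */
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *demo_page;

static int __init mempolicy_demo_init(void)
{
	/* routed through alloc_pages_current() when CONFIG_NUMA=y */
	demo_page = alloc_pages(GFP_KERNEL, 0);
	if (!demo_page)
		return -ENOMEM;

	pr_info("mempolicy_demo: page allocated on node %d\n",
		page_to_nid(demo_page));
	return 0;
}

static void __exit mempolicy_demo_exit(void)
{
	__free_pages(demo_page, 0);
}

module_init(mempolicy_demo_init);
module_exit(mempolicy_demo_exit);
MODULE_LICENSE("GPL");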
/*
 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
 * eliminate the * MPOL_F_* flags that require conditional ref and
 * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
 * after return.  Use the returned value.
 *
 * Allows use of a mempolicy for, e.g., multiple allocations with a single
 * policy lookup, even if the policy needs/has extra ref on lookup.
 * shmem_readahead needs this.
 */
struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
						struct mempolicy *frompol)
{
	if (!mpol_needs_cond_ref(frompol))
		return frompol;

	*tompol = *frompol;
	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
	__mpol_put(frompol);
	return tompol;
}

static int mpol_match_intent(const struct mempolicy *a,
			     const struct mempolicy *b)
{
	if (a->flags != b->flags)
		return 0;
	if (!mpol_store_user_nodemask(a))
		return 1;
	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->mode != b->mode)
		return 0;
	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
		return 0;
	switch (a->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node &&
			a->flags == b->flags;
	default:
		BUG();
		return 0;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->mode : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)