📄 mempolicy.c
void mpol_fix_fork_child_flag(struct task_struct *p)
{
	if (p->mempolicy)
		p->flags |= PF_MEMPOLICY;
	else
		p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
	mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new;
	struct mm_struct *mm = current->mm;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * prevent changing our mempolicy while show_numa_maps()
	 * is using it.
	 * Note: do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
	if (mm)
		down_write(&mm->mmap_sem);
	mpol_put(current->mempolicy);
	current->mempolicy = new;
	mpol_set_task_struct_flag();
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	if (mm)
		up_write(&mm->mmap_sem);

	return 0;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_task_memory_state();
	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		*nmask  = cpuset_current_mems_allowed;
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_policy_nodemask(pol, nmask);

out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
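/*
 * A minimal user-space sketch of the query paths above, assuming the
 * get_mempolicy(2) wrapper from libnuma's <numaif.h> (build with
 * -lnuma); the helper name is illustrative only.
 */
#include <numaif.h>
#include <stdio.h>

static void example_query_policy(void *addr)
{
	unsigned long mask[16] = { 0 };	/* room for 1024 node bits */
	int mode;

	/* MPOL_F_MEMS_ALLOWED: which nodes may this task allocate from? */
	if (get_mempolicy(&mode, mask, 8 * sizeof(mask), NULL,
			  MPOL_F_MEMS_ALLOWED) == 0)
		printf("mems_allowed word 0: %lx\n", mask[0]);

	/* MPOL_F_NODE | MPOL_F_ADDR: which node holds the page at addr? */
	if (get_mempolicy(&mode, NULL, 0, addr,
			  MPOL_F_NODE | MPOL_F_ADDR) == 0)
		printf("page at %p is on node %d\n", addr, mode);
}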
#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
		if (!isolate_lru_page(page)) {
			list_add_tail(&page->lru, pagelist);
		}
	}
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist))
		err = migrate_pages(&pagelist, new_node_page, dest);

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
	if (err)
		goto out;

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from_nodes;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = -1;
		int dest = 0;

		for_each_node_mask(s, tmp) {
			d = node_remap(s, *from_nodes, *to_nodes);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == -1)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;
}
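/*
 * A minimal stand-alone sketch of the <source, dest> pair selection
 * described in the comment above, for masks that fit in one unsigned
 * long; nth_set_bit() and bit_ordinal() are toy stand-ins for the
 * kernel's node_remap() and assume 'to' has at least as many bits set
 * as 'from'.
 */
#include <stdio.h>

/* position of the n-th set bit of 'mask' (n counted from 0), or -1 */
static int nth_set_bit(unsigned long mask, int n)
{
	for (int b = 0; b < 64; b++)
		if ((mask & (1UL << b)) && n-- == 0)
			return b;
	return -1;
}

/* how many set bits of 'mask' lie below position 'b' */
static int bit_ordinal(unsigned long mask, int b)
{
	int ord = 0;

	for (int i = 0; i < b; i++)
		if (mask & (1UL << i))
			ord++;
	return ord;
}

int main(void)
{
	unsigned long from = 0x5;	/* source nodes {0, 2} */
	unsigned long to   = 0x6;	/* target nodes {1, 2} */
	unsigned long tmp  = from;

	while (tmp) {
		int source = -1, dest = 0;

		for (int s = 0; s < 64; s++) {
			if (!(tmp & (1UL << s)))
				continue;
			/* toy node_remap(): same ordinal in 'to' as in 'from' */
			int d = nth_set_bit(to, bit_ordinal(from, s));

			if (d < 0 || s == d)
				continue;
			source = s;
			dest = d;
			/* dest not still a pending source?  best case, stop */
			if (!(tmp & (1UL << dest)))
				break;
		}
		if (source == -1)
			break;
		tmp &= ~(1UL << source);
		/* prints "migrate node 0 -> node 1"; node 2 maps to itself */
		printf("migrate node %d -> node %d\n", source, dest);
	}
	return 0;
}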
/*
 * Allocate a new page for page migration based on vma policy.
 * Start assuming that page is mapped by vma pointed to by @private.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	struct vm_area_struct *vma = (struct vm_area_struct *)private;
	unsigned long uninitialized_var(address);

	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : -1);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			return err;
	}
	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);

		if (!list_empty(&pagelist))
			nr_failed = migrate_pages(&pagelist, new_vma_page,
						(unsigned long)vma);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}

	up_write(&mm->mmap_sem);
	mpol_put(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */
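/*
 * A minimal user-space sketch of this interface, assuming the mbind(2)
 * wrapper from libnuma's <numaif.h> (build with -lnuma); the helper
 * name is illustrative only.
 */
#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>
#include <stddef.h>

static void *example_bind_to_node0(size_t len)
{
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return NULL;

	/*
	 * maxnode counts bits in the mask.  MPOL_MF_MOVE migrates pages
	 * already faulted in; adding MPOL_MF_STRICT would instead fail
	 * with EIO if some pages cannot be placed as requested.
	 */
	if (mbind(p, len, MPOL_BIND, &nodemask,
		  8 * sizeof(nodemask), MPOL_MF_MOVE) != 0)
		perror("mbind");
	return p;
}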
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* If the user specified more nodes than supported, just check
	   that the unsupported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned, flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	const struct cred *cred = current_cred(), *tcred;
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process.  The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	rcu_read_lock();
	tcred = __task_cred(task);
	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}

	if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
		err = -EINVAL;
		goto out;
	}
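/*
 * A minimal user-space sketch of the syscall above, assuming the
 * migrate_pages(2) wrapper from libnuma's <numaif.h>; as checked
 * above, moving another process's pages requires matching uids or
 * CAP_SYS_NICE.  The call returns the number of pages that could not
 * be moved, or a negative error.
 */
#include <numaif.h>
#include <stdio.h>

static long example_move_task(int pid)
{
	unsigned long old_mask = 1UL << 0;	/* move pages found on node 0... */
	unsigned long new_mask = 1UL << 1;	/* ...over to node 1 */
	long left = migrate_pages(pid, 8 * sizeof(old_mask),
				  &old_mask, &new_mask);

	if (left < 0)
		perror("migrate_pages");
	else if (left > 0)
		printf("%ld pages could not be moved\n", left);
	return left;
}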