📄 futex.c
字号:
/* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * * Robust futex support started by Ingo Molnar * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * * PI-futex support started by Ingo Molnar and Thomas Gleixner * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!" * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/module.h>
#include <asm/futex.h>

#include "rtmutex_common.h"

/*
 * log2 of the number of hash buckets in futex_queues[]:
 * 4 (16 buckets) when CONFIG_BASE_SMALL is non-zero, 8 (256 buckets)
 * otherwise.
 */
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)

/*
 * Priority Inheritance state:
 */
struct futex_pi_state {
	/*
	 * list of 'owned' pi_state instances - these have to be
	 * cleaned up in do_exit() if the task exits prematurely:
	 */
	struct list_head list;

	/*
	 * The PI object:
	 */
	struct rt_mutex pi_mutex;

	/*
	 * Task that currently owns this pi_state; NULL when it has no
	 * owner (freshly allocated, recycled, or cleaned up at exit):
	 */
	struct task_struct *owner;

	/*
	 * Reference count. It starts at 1; free_pi_state() tears the
	 * object down when it drops to 0:
	 */
	atomic_t refcount;

	/*
	 * Key of the futex this state belongs to; exit_pi_state_list()
	 * hashes it to find the bucket whose lock it has to take:
	 */
	union futex_key key;
};

/*
 * We use this hashed waitqueue instead of a normal wait_queue_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * wake up q->waiters, then make the second condition true.
 */
struct futex_q {
	/* Node linking this entry into a futex_hash_bucket chain: */
	struct plist_node list;
	wait_queue_head_t waiters;

	/* Which hash list lock to use: */
	spinlock_t *lock_ptr;

	/* Key which the futex is hashed on: */
	union futex_key key;

	/* For fd, sigio sent using these: */
	int fd;
	struct file *filp;

	/* Optional priority inheritance state: */
	struct futex_pi_state *pi_state;
	/*
	 * NOTE(review): not referenced in the part of the file visible
	 * here - presumably the task waiting on this entry; confirm.
	 */
	struct task_struct *task;
};

/*
 * Split the global futex_lock into every hash list lock.
*/
struct futex_hash_bucket {
	spinlock_t lock;
	struct plist_head chain;
};

/* The hash table itself: 1 << FUTEX_HASHBITS buckets, indexed by hash_futex(). */
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];

/* Futex-fs vfsmount entry: */
static struct vfsmount *futex_mnt;

/*
 * We hash on the keys returned from get_futex_key (see below).
 *
 * jhash2() is run over the u32 words spanning key->both.word and
 * key->both.ptr, with key->both.offset passed as its third argument;
 * the low FUTEX_HASHBITS bits of the result select the bucket.
 */
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
	u32 hash = jhash2((u32*)&key->both.word,
			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
			  key->both.offset);
	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
}

/*
 * Return 1 if two futex_keys are equal, 0 otherwise.
 * All three components (word, ptr, offset) have to match.
 */
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
	return (key1->both.word == key2->both.word
		&& key1->both.ptr == key2->both.ptr
		&& key1->both.offset == key2->both.offset);
}

/**
 * get_futex_key - Get parameters which are the keys for a futex.
 * @uaddr: virtual address of the futex
 * @fshared: NULL for a PROCESS_PRIVATE futex,
 *	&current->mm->mmap_sem for a PROCESS_SHARED futex
 * @key: address where result is stored.
 *
 * Returns a negative error code or 0
 * The key words are stored in *key on success.
 *
 * Error codes produced here:
 *   -EINVAL  @uaddr is not aligned to sizeof(u32)
 *   -EFAULT  access_ok() rejected @uaddr (private case), or no vma
 *            was found for the address (shared case)
 *   -EPERM   the vma has VM_IO set
 *   -EACCES  the vma lacks VM_READ
 *   or whatever negative value get_user_pages() returned for a
 *   non-linear mapping.
 *
 * Note that key->both.offset is written before any validation, so *key
 * may be partially modified even when an error is returned.
 *
 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 * We can usually work out the index without swapping in the page.
 *
 * fshared is NULL for PROCESS_PRIVATE futexes
 * For other futexes, it points to &current->mm->mmap_sem and
 * caller must have taken the reader lock, but NOT any spinlocks.
 */
int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
		  union futex_key *key)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct page *page;
	int err;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	/* From here on 'address' is the page-aligned base of uaddr. */
	address -= key->both.offset;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * virtual address, we don't even have to find the underlying vma.
	 * Note : We do have to check 'uaddr' is a valid user address,
	 *        but access_ok() should be faster than find_vma()
	 */
	if (!fshared) {
		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
			return -EFAULT;
		/*
		 * Neither FUT_OFF_INODE nor FUT_OFF_MMSHARED is set in the
		 * offset, so get_futex_key_refs()/drop_futex_key_refs()
		 * take and drop no reference for this kind of key.
		 */
		key->private.mm = mm;
		key->private.address = address;
		return 0;
	}
	/*
	 * The futex is hashed differently depending on whether
	 * it's in a shared or private mapping.  So check vma first.
	 */
	vma = find_extend_vma(mm, address);
	if (unlikely(!vma))
		return -EFAULT;

	/*
	 * Permissions.
	 */
	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.  Therefore we use
	 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
	 * mappings of _writable_ handles.
	 */
	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
		key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
		key->private.mm = mm;
		key->private.address = address;
		return 0;
	}

	/*
	 * Linear file mappings are also simple.
	 */
	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
	key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
				     + vma->vm_pgoff);
		return 0;
	}

	/*
	 * We could walk the page table to read the non-linear
	 * pte, and get the page index without fetching the page
	 * from swap.  But that's a lot of code to duplicate here
	 * for a rare case, so we simply fetch the page.
	 */
	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
	if (err >= 0) {
		key->shared.pgoff =
			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		/* The page was only needed for its index; release it again. */
		put_page(page);
		return 0;
	}
	return err;
}
EXPORT_SYMBOL_GPL(get_futex_key);

/*
 * Take a reference to the resource addressed by a key.
 * Can be called while holding spinlocks.
 *
 * Nothing is done when key->both.ptr is 0. Otherwise the type bits in
 * key->both.offset decide: FUT_OFF_INODE increments the inode's i_count,
 * FUT_OFF_MMSHARED increments the mm's mm_count, and any other
 * combination takes no reference.
 */
inline void get_futex_key_refs(union futex_key *key)
{
	if (key->both.ptr == 0)
		return;
	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
		case FUT_OFF_INODE:
			atomic_inc(&key->shared.inode->i_count);
			break;
		case FUT_OFF_MMSHARED:
			atomic_inc(&key->private.mm->mm_count);
			break;
	}
}
EXPORT_SYMBOL_GPL(get_futex_key_refs);

/*
 * Drop a reference to the resource addressed by a key.
 * The hash bucket spinlock must not be held.
 *
 * Mirror image of get_futex_key_refs(): iput() for FUT_OFF_INODE keys,
 * mmdrop() for FUT_OFF_MMSHARED keys, nothing otherwise.
 */
void drop_futex_key_refs(union futex_key *key)
{
	if (key->both.ptr == 0)
		return;
	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
		case FUT_OFF_INODE:
			iput(key->shared.inode);
			break;
		case FUT_OFF_MMSHARED:
			mmdrop(key->private.mm);
			break;
	}
}
EXPORT_SYMBOL_GPL(drop_futex_key_refs);

/*
 * Read the u32 at user address 'from' into *dest with page faults
 * disabled, using the in-atomic copy primitive.
 *
 * Returns 0 on success, or -EFAULT when the copy reports any failure.
 */
static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}

/*
 * Fault handling.
* if fshared is non NULL, current->mm->mmap_sem is already held
 *
 * When fshared is NULL this function takes mmap_sem for reading itself
 * and releases it before returning.
 *
 * Returns 0 if handle_mm_fault() reported VM_FAULT_MINOR or
 * VM_FAULT_MAJOR (the matching per-task fault counter is bumped).
 * Returns -EFAULT if 'attempt' is greater than 2, if find_vma() yields
 * no vma that contains 'address' and has VM_WRITE set, or if
 * handle_mm_fault() returns any other value.
 */
static int futex_handle_fault(unsigned long address,
			      struct rw_semaphore *fshared, int attempt)
{
	struct vm_area_struct * vma;
	struct mm_struct *mm = current->mm;
	int ret = -EFAULT;

	/* Give up after the caller has already retried more than twice. */
	if (attempt > 2)
		return ret;

	if (!fshared)
		down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (vma && address >= vma->vm_start &&
	    (vma->vm_flags & VM_WRITE)) {
		switch (handle_mm_fault(mm, vma, address, 1)) {
		case VM_FAULT_MINOR:
			ret = 0;
			current->min_flt++;
			break;
		case VM_FAULT_MAJOR:
			ret = 0;
			current->maj_flt++;
			break;
		}
	}
	if (!fshared)
		up_read(&mm->mmap_sem);
	return ret;
}

/*
 * PI code:
 */

/*
 * Make sure current->pi_state_cache holds a pi_state.
 *
 * Returns 0 if the cache was already filled or a new zeroed object was
 * allocated (GFP_KERNEL) and stored there with an empty list, no owner
 * and a refcount of 1. Returns -ENOMEM if the allocation failed.
 */
static int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	atomic_set(&pi_state->refcount, 1);

	current->pi_state_cache = pi_state;

	return 0;
}

/*
 * Hand out the pi_state stored in current->pi_state_cache and empty the
 * cache slot. Warns (and returns NULL) if the cache is empty;
 * presumably callers run refill_pi_state_cache() first - confirm.
 */
static struct futex_pi_state * alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

/*
 * Drop one reference to pi_state. Nothing else happens unless this was
 * the last reference. On the last reference the state is unlinked from
 * its owner (if any) and the rt_mutex is proxy-unlocked on the owner's
 * behalf; the object is then freed, or recycled into
 * current->pi_state_cache if that slot is empty.
 */
static void free_pi_state(struct futex_pi_state *pi_state)
{
	if (!atomic_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		/* The owner's list of pi_states is modified under its pi_lock. */
		spin_lock_irq(&pi_state->owner->pi_lock);
		list_del_init(&pi_state->list);
		spin_unlock_irq(&pi_state->owner->pi_lock);

		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
	}

	if (current->pi_state_cache)
		kfree(pi_state);
	else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		atomic_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * Look up the task based on what TID userspace gave us.
 * We don't trust it.
*
 * The lookup is done under rcu_read_lock(). On success the task is
 * returned with a reference taken via get_task_struct(). If no task has
 * that pid, or current->euid matches neither the task's euid nor its
 * uid, ERR_PTR(-ESRCH) is returned instead.
 */
static struct task_struct * futex_find_get_task(pid_t pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_pid(pid);

	if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
		p = ERR_PTR(-ESRCH);
	else
		get_task_struct(p);

	rcu_read_unlock();

	return p;
}

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 *
 * Walks curr->pi_state_list until it is empty. Each entry is unlinked,
 * its owner is cleared and its rt_mutex is unlocked.
 */
void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key;

	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {

		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		/* Snapshot the key into a local before pi_lock is dropped below. */
		key = pi_state->key;
		hb = hash_futex(&key);
		/*
		 * Lock sequence: release pi_lock, take hb->lock, then
		 * re-take pi_lock (presumably the required lock order
		 * is hb->lock before pi_lock - confirm).
		 */
		spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);

		spin_lock_irq(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* pi_lock stays held, as the loop condition expects. */
			spin_unlock(&hb->lock);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;
		spin_unlock_irq(&curr->pi_lock);

		/* The rt_mutex is released with hb->lock held but pi_lock dropped. */
		rt_mutex_unlock(&pi_state->pi_mutex);

		spin_unlock(&hb->lock);

		/* Re-take pi_lock for the next list_empty() check. */
		spin_lock_irq(&curr->pi_lock);
	}
	spin_unlock_irq(&curr->pi_lock);
}

static intlookup_pi_state(u32 uval, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps){ struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; struct plist_head *head; struct task_struct *p; pid_t pid = uval & FUTEX_TID_MASK; head = &hb->chain; plist_for_each_entry_safe(this, next, head, list) { if (match_futex(&this->key, key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: */ pi_state = 
this->pi_state; /* * Userspace might have messed up non PI and PI futexes */ if (unlikely(!pi_state)) return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); WARN_ON(pid && pi_state->owner && pi_state->owner->pid != pid); atomic_inc(&pi_state->refcount); *ps = pi_state; return 0; } } /* * We are the first waiter - try to look up the real owner and attach * the new pi_state to it, but bail out when TID = 0 */ if (!pid) return -ESRCH; p = futex_find_get_task(pid); if (IS_ERR(p)) return PTR_ERR(p); /* * We need to look at the task state flags to figure out, * whether the task is exiting. To protect against the do_exit * change of the task flags, we do this protected by * p->pi_lock:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -