/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>

#include <asm/tlb.h>
#include <asm/unistd.h>

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is default implementation.
 * Architectures and sub-architectures can override this.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	return (unsigned long long)jiffies * (1000000000 / HZ);
}

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 * Timeslices get refilled after they expire.
 */
#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE		(100 * HZ / 1000)
#define ON_RUNQUEUE_WEIGHT	 30
#define CHILD_PENALTY		 95
#define PARENT_PENALTY		100
#define EXIT_WEIGHT		  3
#define PRIO_BONUS_RATIO	 25
#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define INTERACTIVE_DELTA	  2
#define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
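
/*
 * Worked example for the conversions above (illustrative only; assumes the
 * usual MAX_RT_PRIO == 100 and MAX_PRIO == 140 from <linux/sched.h>, and
 * HZ == 1000):
 *
 *   NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120, NICE_TO_PRIO(19) == 139
 *   PRIO_TO_NICE(NICE_TO_PRIO(n)) == n, so the two macros are exact inverses
 *   USER_PRIO() maps that range onto [ 0 ... 39 ], hence MAX_USER_PRIO == 40
 *   MAX_BONUS == 40 * 25 / 100 == 10
 *   MIN_TIMESLICE == 5 jiffies, DEF_TIMESLICE == 100 jiffies,
 *   MAX_SLEEP_AVG == 100 * 10 == 1000 jiffies
 */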

/*
 * If a task is 'interactive' then we reinsert it in the active
 * array after it has expired its current timeslice. (it will not
 * continue to run immediately, it will still roundrobin with
 * other interactive tasks.)
 *
 * This part scales the interactivity limit depending on niceness.
 *
 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
 * Here are a few examples of different nice levels:
 *
 *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
 *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
 *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
 *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
 *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
 *
 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
 *  priority range a task can explore, a value of '1' means the
 *  task is rated interactive.)
 *
 * Ie. nice +19 tasks can never get 'interactive' enough to be
 * reinserted into the active array. And only heavily CPU-hog nice -20
 * tasks will be expired. Default nice 0 tasks are somewhere between,
 * it takes some effort for them to get interactive, but it's not
 * too hard.
 */
#define CURRENT_BONUS(p) \
	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
		MAX_SLEEP_AVG)

#define GRANULARITY	(10 * HZ / 1000 ? : 1)

#ifdef CONFIG_SMP
#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
			num_online_cpus())
#else
#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
#endif

#define SCALE(v1,v1_max,v2_max) \
	(v1) * (v2_max) / (v1_max)

#define DELTA(p) \
	(SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
		INTERACTIVE_DELTA)

#define TASK_INTERACTIVE(p) \
	((p)->prio <= (p)->static_prio - DELTA(p))

#define INTERACTIVE_SLEEP(p) \
	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))

#define TASK_PREEMPTS_CURR(p, rq) \
	((p)->prio < (rq)->curr->prio)

#define SCALE_PRIO(x, prio) \
	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
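
/*
 * Worked example for the timeslice scaling above (illustrative only; assumes
 * MAX_PRIO == 140, MAX_USER_PRIO == 40 and HZ == 1000, so DEF_TIMESLICE == 100
 * jiffies and MIN_TIMESLICE == 5):
 *
 *   nice -20 (static_prio 100): SCALE_PRIO(400, 100) == 400 * 40 / 20 == 800ms
 *   nice   0 (static_prio 120): SCALE_PRIO(100, 120) == 100 * 20 / 20 == 100ms
 *   nice +19 (static_prio 139): SCALE_PRIO(100, 139) == max(5, 5)     ==   5ms
 *
 * which matches the [800ms ... 100ms ... 5ms] range documented at
 * task_timeslice() below.
 */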

#ifdef CONFIG_SMP
/*
 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 */
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
	return reciprocal_divide(load, sg->reciprocal_cpu_power);
}

/*
 * Each time a sched group cpu_power is changed,
 * we must compute its reciprocal value
 */
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
	sg->__cpu_power += val;
	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
#endif

/*
 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 * to time slice values: [800ms ... 100ms ... 5ms]
 *
 * The higher a thread's priority, the bigger timeslices
 * it gets during one round of execution. But even the lowest
 * priority thread gets MIN_TIMESLICE worth of execution time.
 */
static inline unsigned int task_timeslice(struct task_struct *p)
{
	return static_prio_timeslice(p->static_prio);
}

/*
 * These are the runqueue data structures:
 */
struct prio_array {
	unsigned int nr_active;
	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_PRIO];
};

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	unsigned long raw_weighted_load;
#ifdef CONFIG_SMP
	unsigned long cpu_load[3];
	unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
	unsigned char in_nohz_recently;
#endif
#endif
	unsigned long long nr_switches;

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	unsigned long expired_timestamp;
	/* Cached timestamp set by update_cpu_clock() */
	unsigned long long most_recent_timestamp;
	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;
	struct prio_array *active, *expired, arrays[2];
	int best_expired_prio;
	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct sched_domain *sd;

	/* For active balancing */
	int active_balance;
	int push_cpu;
	int cpu;		/* cpu of this runqueue */

	struct task_struct *migration_thread;
	struct list_head migration_queue;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;

	/* sys_sched_yield() stats */
	unsigned long yld_exp_empty;
	unsigned long yld_act_empty;
	unsigned long yld_both_empty;
	unsigned long yld_cnt;

	/* schedule() stats */
	unsigned long sched_switch;
	unsigned long sched_cnt;
	unsigned long sched_goidle;

	/* try_to_wake_up() stats */
	unsigned long ttwu_cnt;
	unsigned long ttwu_local;
#endif
	struct lock_class_key rq_lock_key;
};

static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->oncpu;
#else
	return rq->curr == p;
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	spin_unlock_irq(&rq->lock);
#else
	spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

repeat_lock_task:
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock(&rq->lock);
		goto repeat_lock_task;
	}
	return rq;
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct rq *rq;

repeat_lock_task:
	local_irq_save(*flags);
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock_irqrestore(&rq->lock, *flags);
		goto repeat_lock_task;
	}
	return rq;
}

static inline void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	spin_unlock_irqrestore(&rq->lock, *flags);
}
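
/*
 * Typical usage sketch of the helpers above (illustrative only, not code
 * from this file):
 *
 *	struct rq *rq;
 *	unsigned long flags;
 *
 *	rq = task_rq_lock(p, &flags);
 *	... 'p' cannot change runqueues while rq->lock is held ...
 *	task_rq_unlock(rq, &flags);
 *
 * The repeat_lock_task loop is needed because the task may migrate to another
 * CPU between the task_rq(p) lookup and the moment rq->lock is actually taken;
 * rechecking task_rq(p) under the lock makes the result stable.
 */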

#ifdef CONFIG_SCHEDSTATS
/*
 * bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
#define SCHEDSTAT_VERSION 14

static int show_schedstat(struct seq_file *seq, void *v)
{
	int cpu;

	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
	seq_printf(seq, "timestamp %lu\n", jiffies);
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
		struct sched_domain *sd;
		int dcnt = 0;
#endif

		/* runqueue-specific stats */
		seq_printf(seq,
		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
		    cpu, rq->yld_both_empty,
		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
		    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
		    rq->ttwu_cnt, rq->ttwu_local,
		    rq->rq_sched_info.cpu_time,
		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);

		seq_printf(seq, "\n");

#ifdef CONFIG_SMP
		/* domain-specific stats */