📄 sched.c
/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *              make semaphores SMP safe
 *  1998-11-19  Implemented schedule_timeout() and related stuff
 *              by Andrea Arcangeli
 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
 *              hybrid priority-list and round-robin design with
 *              an array-switch method of distributing timeslices
 *              and per-CPU runqueues.  Cleanups and useful suggestions
 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03  Interactivity tuning by Con Kolivas.
 *  2004-04-02  Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)     ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD             SCHED_LOAD_SCALE
#define NICE_0_SHIFT            SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE           (100 * HZ / 1000)

/*
 * single value that denotes runtime == period, ie unlimited time.
 */
#define RUNTIME_INF             ((u64)~0ULL)
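/*
 * A quick worked example of the nice <-> priority mapping above, as a
 * stand-alone user-space sketch (it assumes the usual MAX_RT_PRIO == 100
 * and MAX_PRIO == 140; the snippet is illustrative and not part of this
 * file's API):
 */
#if 0
#include <stdio.h>

#define MAX_RT_PRIO 100
#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)            ((p) - MAX_RT_PRIO)

int main(void)
{
        /* nice -20 -> prio 100, nice 0 -> prio 120, nice 19 -> prio 139 */
        printf("%d %d %d\n",
               NICE_TO_PRIO(-20), NICE_TO_PRIO(0), NICE_TO_PRIO(19));
        /* and back again, plus the 0..39 'user priority' view */
        printf("%d %d\n", PRIO_TO_NICE(139), USER_PRIO(139));   /* 19 39 */
        return 0;
}
#endif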
#ifdef CONFIG_SMP
/*
 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 */
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
        return reciprocal_divide(load, sg->reciprocal_cpu_power);
}

/*
 * Each time a sched group cpu_power is changed,
 * we must compute its reciprocal value
 */
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
        sg->__cpu_power += val;
        sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
#endif

static inline int rt_policy(int policy)
{
        if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
                return 1;
        return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
        return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
        DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
        struct list_head queue[MAX_RT_PRIO];
};

struct rt_bandwidth {
        /* nests inside the rq lock: */
        spinlock_t              rt_runtime_lock;
        ktime_t                 rt_period;
        u64                     rt_runtime;
        struct hrtimer          rt_period_timer;
};

static struct rt_bandwidth def_rt_bandwidth;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
        struct rt_bandwidth *rt_b =
                container_of(timer, struct rt_bandwidth, rt_period_timer);
        ktime_t now;
        int overrun;
        int idle = 0;

        for (;;) {
                now = hrtimer_cb_get_time(timer);
                overrun = hrtimer_forward(timer, now, rt_b->rt_period);

                if (!overrun)
                        break;

                idle = do_sched_rt_period_timer(rt_b, overrun);
        }

        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
        rt_b->rt_period = ns_to_ktime(period);
        rt_b->rt_runtime = runtime;

        spin_lock_init(&rt_b->rt_runtime_lock);

        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
        rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
        ktime_t now;

        if (rt_b->rt_runtime == RUNTIME_INF)
                return;

        if (hrtimer_active(&rt_b->rt_period_timer))
                return;

        spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
                if (hrtimer_active(&rt_b->rt_period_timer))
                        break;

                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
                hrtimer_start(&rt_b->rt_period_timer,
                              rt_b->rt_period_timer.expires,
                              HRTIMER_MODE_ABS);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
        hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);
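/*
 * sg_div_cpu_power() above turns "load / cpu_power" into a multiply and a
 * shift: because cpu_power changes rarely, its 32-bit fixed-point reciprocal
 * is precomputed once (see sg_inc_cpu_power()) and reused for every divide.
 * Below is a stand-alone sketch of the same trick, in the spirit of
 * <linux/reciprocal_div.h>; the names and details here are illustrative,
 * not the kernel's exact implementation.
 */
#if 0
#include <stdint.h>
#include <assert.h>

/* roughly reciprocal_value(): ceil(2^32 / k) */
static uint32_t recip_value(uint32_t k)
{
        return (uint32_t)((((uint64_t)1 << 32) + k - 1) / k);
}

/* roughly reciprocal_divide(): a / k done as one multiply and one shift */
static uint32_t recip_divide(uint32_t a, uint32_t r)
{
        return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
        uint32_t cpu_power = 1024;      /* a SCHED_LOAD_SCALE-like value */
        uint32_t r = recip_value(cpu_power);

        assert(recip_divide(4096, r) == 4096 / cpu_power);      /* == 4 */
        return 0;
}
#endif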
#ifdef CONFIG_GROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
#ifdef CONFIG_CGROUP_SCHED
        struct cgroup_subsys_state css;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
        /* runqueue "owned" by this group on each cpu */
        struct cfs_rq **cfs_rq;
        unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        struct sched_rt_entity **rt_se;
        struct rt_rq **rt_rq;

        struct rt_bandwidth rt_bandwidth;
#endif

        struct rcu_head rcu;
        struct list_head list;

        struct task_group *parent;
        struct list_head siblings;
        struct list_head children;
};

#ifdef CONFIG_USER_SCHED

/*
 * Root task group.
 *      Every UID task group (including init_task_group aka UID-0) will
 *      be a child to this group.
 */
struct task_group root_task_group;

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */
#else /* !CONFIG_FAIR_GROUP_SCHED */
#define root_task_group init_task_group
#endif /* CONFIG_FAIR_GROUP_SCHED */

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD   (2*NICE_0_LOAD)
#else /* !CONFIG_USER_SCHED */
# define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
#endif /* CONFIG_USER_SCHED */

/*
 * A weight of 0 or 1 can cause arithmetics problems.
 * A weight of a cfs_rq is the sum of weights of which entities
 * are queued on this cfs_rq, so a weight of a entity should not be
 * too large, so as the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES      2
#define MAX_SHARES      (1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/* Default task group.
 *      Every task in system belong to this group at bootup.
 */
struct task_group init_task_group;
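/*
 * MIN_SHARES/MAX_SHARES above bound a group's cpu shares because, as the
 * comment notes, CFS weights of 0 or 1 (and absurdly large weights) break
 * the weight arithmetic.  An illustrative sketch of how a share-setting
 * path might clamp a user-supplied value into that range (the helper name
 * is hypothetical, not a function defined in this file):
 */
#if 0
static unsigned long clamp_group_shares(unsigned long shares)
{
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > MAX_SHARES)
                shares = MAX_SHARES;
        return shares;
}
#endif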
/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
        struct task_group *tg;

#ifdef CONFIG_USER_SCHED
        tg = p->user->tg;
#elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
#else
        tg = &init_task_group;
#endif
        return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
        p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
        p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

#else

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
        return NULL;
}

#endif  /* CONFIG_GROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
        struct load_weight load;
        unsigned long nr_running;

        u64 exec_clock;
        u64 min_vruntime;
        u64 pair_start;

        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;

        struct list_head tasks;
        struct list_head *balance_iterator;

        /*
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
        struct sched_entity *curr, *next;

        unsigned long nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
        struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */

        /*
         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
         * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
         * (like users, containers etc.)
         *
         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
         * list is used during load balance.
         */
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */

#ifdef CONFIG_SMP
        /*
         * the part of load.weight contributed by tasks
         */
        unsigned long task_weight;

        /*
         * h_load = weight * f(tg)
         *
         * Where f(tg) is the recursive weight fraction assigned to
         * this group.
         */
        unsigned long h_load;

        /*
         * this cpu's part of tg->shares
         */
        unsigned long shares;

        /*
         * load.weight at the time we set shares
         */
        unsigned long rq_weight;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
        struct rt_prio_array active;
        unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        int highest_prio; /* highest queued rt task prio */
#endif
#ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
#endif
        int rt_throttled;
        u64 rt_time;
        u64 rt_runtime;
        /* Nests inside the rq lock: */
        spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
        unsigned long rt_nr_boosted;

        struct rq *rq;
        struct list_head leaf_rt_rq_list;
        struct task_group *tg;
        struct sched_rt_entity *rt_se;
#endif
};
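/*
 * struct rt_prio_array (embedded above as rt_rq.active) is the O(1) RT
 * runqueue: one list per priority level plus a bitmap of non-empty levels,
 * so picking the next RT task is a find-first-set-bit followed by taking
 * the head of that list.  A minimal user-space sketch of the same pattern
 * (names are illustrative; plain arrays stand in for the kernel's bitmap
 * and list helpers):
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define TOY_NPRIO 100                   /* mirrors MAX_RT_PRIO */

struct toy_rt_array {
        uint64_t bitmap[2];             /* bit set => that priority queue is non-empty */
        int      queued[TOY_NPRIO];     /* stand-in for the per-priority task lists */
};

static void toy_enqueue(struct toy_rt_array *a, int prio)
{
        a->queued[prio]++;
        a->bitmap[prio / 64] |= 1ULL << (prio % 64);
}

/* lowest numbered set bit == highest RT priority */
static int toy_pick_next(const struct toy_rt_array *a)
{
        for (int w = 0; w < 2; w++)
                if (a->bitmap[w])
                        return w * 64 + __builtin_ctzll(a->bitmap[w]);
        return -1;                      /* no runnable RT task */
}

int main(void)
{
        struct toy_rt_array a = { { 0 }, { 0 } };

        toy_enqueue(&a, 42);
        toy_enqueue(&a, 7);
        printf("next prio: %d\n", toy_pick_next(&a));   /* 7 */
        return 0;
}
#endif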
#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
        atomic_t refcount;
        cpumask_t span;
        cpumask_t online;

        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
        cpumask_t rto_mask;
        atomic_t rto_count;
#ifdef CONFIG_SMP
        struct cpupri cpupri;
#endif
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
        /* runqueue lock: */
        spinlock_t lock;

        /*
         * nr_running and cpu_load should be in the same cacheline because
         * remote CPUs use both these fields when doing load calculation.
         */
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
        unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
#endif
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;

        struct cfs_rq cfs;
        struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
        /* list of leaf cfs_rq on this cpu: */
        struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
        struct list_head leaf_rt_rq_list;
#endif

        /*
         * This is part of a global counter where only the total sum