sched_fair.c
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 20000000ULL;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 4000000ULL;

/*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 5;

/*
 * After fork, child runs first. (default) If set to 0 then
 * parent will (try to) run first.
 */
const_debug unsigned int sysctl_sched_child_runs_first = 1;

/*
 * sys_sched_yield() compat mode
 *
 * This option switches the aggressive yield implementation of the
 * old scheduler back on.
 */
unsigned int __read_mostly sysctl_sched_compat_yield;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
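/*
 * Illustration (not part of sched_fair.c): a standalone sketch of how the
 * "(1 + ilog(ncpus))" scaling mentioned in the comments above plays out for
 * the base tunable values.  The real boot-time scaling happens elsewhere in
 * the scheduler, not in this file; the numbers below only follow the formula
 * quoted in the comments, so treat them as an approximation.
 */
#include <stdio.h>

static unsigned int demo_ilog2(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int ncpus;

	for (ncpus = 1; ncpus <= 8; ncpus *= 2) {
		unsigned int factor = 1 + demo_ilog2(ncpus);
		unsigned long long latency  = 20000000ULL * factor;	/* 20 ms base */
		unsigned long long min_gran =  4000000ULL * factor;	/*  4 ms base */

		/* sched_nr_latency is kept at latency / min_granularity (= 5 here) */
		printf("%u cpus: latency=%llu ns, min_granularity=%llu ns, nr_latency=%llu\n",
		       ncpus, latency, min_gran, latency / min_gran);
	}
	return 0;
}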
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 * another cpu ('this_cpu')
 */
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return cfs_rq->tg->cfs_rq[this_cpu];
}

/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

#else	/* CONFIG_FAIR_GROUP_SCHED */

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return &cpu_rq(this_cpu)->cfs;
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */


/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return se->vruntime - cfs_rq->min_vruntime;
}
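/*
 * Illustration (not part of sched_fair.c): why max_vruntime()/min_vruntime()
 * above compare through a signed delta instead of a plain "a < b".  vruntime
 * is an unsigned 64-bit nanosecond counter; casting the difference to s64
 * keeps the comparison correct even if the counter wraps, because only the
 * relative distance matters.  Standalone sketch with a local copy of the
 * helper.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t demo_max_vruntime(uint64_t min_vruntime, uint64_t vruntime)
{
	int64_t delta = (int64_t)(vruntime - min_vruntime);

	if (delta > 0)
		min_vruntime = vruntime;
	return min_vruntime;
}

int main(void)
{
	/* near the wrap point: a is logically behind b although a > b numerically */
	uint64_t a = UINT64_MAX - 100;	/* just before wraparound */
	uint64_t b = 50;		/* just after wraparound */

	/* a naive comparison picks a; the signed-delta version picks b */
	printf("naive max     : %llu\n", (unsigned long long)(a > b ? a : b));
	printf("wrap-safe max : %llu\n", (unsigned long long)demo_max_vruntime(a, b));
	return 0;
}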
/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	s64 key = entity_key(cfs_rq, se);
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (key < entity_key(cfs_rq, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost) {
		cfs_rq->rb_leftmost = &se->run_node;
		/*
		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
		 * value tracking the leftmost vruntime in the tree.
		 */
		cfs_rq->min_vruntime =
			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
	}

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;
		struct sched_entity *next;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;

		if (next_node) {
			next = rb_entry(next_node, struct sched_entity,
					run_node);
			cfs_rq->min_vruntime =
				max_vruntime(cfs_rq->min_vruntime,
					     next->vruntime);
		}
	}

	if (cfs_rq->next == se)
		cfs_rq->next = NULL;

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rb_leftmost;
}

static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
}

static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

#ifdef CONFIG_SCHED_DEBUG
int sched_nr_latency_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

	return 0;
}
#endif

/*
 * delta *= w / rw
 */
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
{
	for_each_sched_entity(se) {
		delta = calc_delta_mine(delta,
				se->load.weight, &cfs_rq_of(se)->load);
	}

	return delta;
}

/*
 * delta *= rw / w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	for_each_sched_entity(se) {
		delta = calc_delta_mine(delta,
				cfs_rq_of(se)->load.weight, &se->load);
	}

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}
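/*
 * Illustration (not part of sched_fair.c): the period computed by
 * __sched_period() above, using the default tunables (20 ms latency, 4 ms
 * minimum granularity, so nr_latency = 5).  Up to 5 runnable tasks share the
 * 20 ms latency window; beyond that the period is stretched to
 * nr_running * 4 ms so no slice shrinks below the minimum granularity.
 * Standalone sketch with local copies of the defaults.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_LATENCY_NS		20000000ULL	/* sysctl_sched_latency default */
#define DEMO_MIN_GRAN_NS	 4000000ULL	/* sysctl_sched_min_granularity default */
#define DEMO_NR_LATENCY		(DEMO_LATENCY_NS / DEMO_MIN_GRAN_NS)

static uint64_t demo_sched_period(unsigned long nr_running)
{
	uint64_t period = DEMO_LATENCY_NS;

	if (nr_running > DEMO_NR_LATENCY)
		period = DEMO_MIN_GRAN_NS * nr_running;

	return period;
}

int main(void)
{
	unsigned long nr;

	for (nr = 1; nr <= 8; nr++)
		printf("nr_running=%lu -> period=%llu ns\n",
		       nr, (unsigned long long)demo_sched_period(nr));
	return 0;
}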
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*w/rw
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
}

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s*rw/w = p
 */
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	unsigned long nr_running = cfs_rq->nr_running;

	if (!se->on_rq)
		nr_running++;

	return __sched_period(nr_running);
}

/*
 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
 * that it favours >=0 over <0.
 *
 *   -20         |
 *               |
 *     0 --------+-------
 *             .'
 *    19     .'
 *
 */
static unsigned long
calc_delta_asym(unsigned long delta, struct sched_entity *se)
{
	struct load_weight lw = {
		.weight = NICE_0_LOAD,
		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
	};

	for_each_sched_entity(se) {
		struct load_weight *se_lw = &se->load;
		unsigned long rw = cfs_rq_of(se)->load.weight;

#ifdef CONFIG_FAIR_SCHED_GROUP
		struct cfs_rq *cfs_rq = se->my_q;
		struct task_group *tg = NULL;

		if (cfs_rq)
			tg = cfs_rq->tg;

		if (tg && tg->shares < NICE_0_LOAD) {
			/*
			 * scale shares to what it would have been had
			 * tg->weight been NICE_0_LOAD:
			 *
			 *   weight = 1024 * shares / tg->weight
			 */
			lw.weight *= se->load.weight;
			lw.weight /= tg->shares;

			lw.inv_weight = 0;

			se_lw = &lw;
			rw += lw.weight - se->load.weight;
		} else
#endif

		if (se->load.weight < NICE_0_LOAD) {
			se_lw = &lw;
			rw += NICE_0_LOAD - se->load.weight;
		}

		delta = calc_delta_mine(delta, rw, se_lw);
	}

	return delta;
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
	curr->vruntime += delta_exec_weighted;
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		cpuacct_charge(curtask, delta_exec);
	}
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
}
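/*
 * Illustration (not part of sched_fair.c): sched_slice() above hands out
 * wall-clock slices proportional to weight (s = p*w/rw), while __update_curr()
 * advances vruntime inversely proportional to weight.  The net effect is that
 * every task's vruntime advances by roughly the same amount over one period.
 * The nice-level weights below (3121 / 1024 / 335 for nice -5 / 0 / +5) are
 * the scheduler's standard load weights; the arithmetic is plain integer
 * math, ignoring calc_delta_mine()'s fixed-point rounding.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_NICE_0_LOAD	1024ULL
#define DEMO_PERIOD_NS		20000000ULL	/* 3 tasks <= nr_latency, so period = latency */

int main(void)
{
	const int nice[3] = { -5, 0, 5 };
	const uint64_t weight[3] = { 3121, 1024, 335 };
	uint64_t rw = 0;
	int i;

	for (i = 0; i < 3; i++)
		rw += weight[i];

	for (i = 0; i < 3; i++) {
		uint64_t slice = DEMO_PERIOD_NS * weight[i] / rw;	/* s  = p * w / rw   */
		uint64_t dvrun = slice * DEMO_NICE_0_LOAD / weight[i];	/* dv = s * 1024 / w */

		printf("nice %3d: slice=%8llu ns, vruntime advance=%8llu ns\n",
		       nice[i],
		       (unsigned long long)slice,
		       (unsigned long long)dvrun);
	}
	return 0;
}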
/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->wait_max, max(se->wait_max,
			rq_of(cfs_rq)->clock - se->wait_start));
	schedstat_set(se->wait_count, se->wait_count + 1);
	schedstat_set(se->wait_sum, se->wait_sum +
			rq_of(cfs_rq)->clock - se->wait_start);
	schedstat_set(se->wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock;
}

/**************************************************
 * Scheduling class queueing methods:
 */

#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
static void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
	cfs_rq->task_weight += weight;
}
#else
static inline void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
}
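/*
 * Illustration (not part of sched_fair.c): the wait statistics maintained by
 * update_stats_wait_start()/update_stats_wait_end() above.  wait_start is
 * stamped when a non-current entity is enqueued; when the wait ends, the
 * maximum, count and sum of wait times are accumulated.  Standalone sketch
 * with a hypothetical fake clock standing in for rq->clock.
 */
#include <stdio.h>
#include <stdint.h>

struct demo_wait_stats {
	uint64_t wait_start;
	uint64_t wait_max;
	uint64_t wait_count;
	uint64_t wait_sum;
};

static void demo_wait_begin(struct demo_wait_stats *st, uint64_t now)
{
	st->wait_start = now;			/* like update_stats_wait_start() */
}

static void demo_wait_end(struct demo_wait_stats *st, uint64_t now)
{
	uint64_t waited = now - st->wait_start;

	if (waited > st->wait_max)		/* like the schedstat_set() updates */
		st->wait_max = waited;
	st->wait_count++;
	st->wait_sum += waited;
	st->wait_start = 0;
}

int main(void)
{
	struct demo_wait_stats st = { 0, 0, 0, 0 };

	demo_wait_begin(&st, 1000);		/* enqueued at t=1000 ns */
	demo_wait_end(&st, 4000);		/* picked to run at t=4000 ns: waited 3000 ns */

	demo_wait_begin(&st, 9000);
	demo_wait_end(&st, 10000);		/* waited 1000 ns */

	printf("wait_max=%llu wait_count=%llu wait_sum=%llu\n",
	       (unsigned long long)st.wait_max,
	       (unsigned long long)st.wait_count,
	       (unsigned long long)st.wait_sum);
	return 0;
}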