sched_fair.c
{
}
#endif

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se))
		add_cfs_task_weight(cfs_rq, se->load.weight);
	cfs_rq->nr_running++;
	se->on_rq = 1;
	list_add(&se->group_node, &cfs_rq->tasks);
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se))
		add_cfs_task_weight(cfs_rq, -se->load.weight);
	cfs_rq->nr_running--;
	se->on_rq = 0;
	list_del_init(&se->group_node);
}

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	if (se->sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
		struct task_struct *tsk = task_of(se);

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->sleep_max))
			se->sleep_max = delta;

		se->sleep_start = 0;
		se->sum_sleep_runtime += delta;

		account_scheduler_latency(tsk, delta >> 10, 1);
	}
	if (se->block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
		struct task_struct *tsk = task_of(se);

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->block_max))
			se->block_max = delta;

		se->block_start = 0;
		se->sum_sleep_runtime += delta;

		/*
		 * Blocking time is in units of nanosecs, so shift by 20 to
		 * get a milliseconds-range estimation of the amount of
		 * time that the task spent sleeping:
		 */
		if (unlikely(prof_on == SLEEP_PROFILING)) {
			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
				     delta >> 20);
		}
		account_scheduler_latency(tsk, delta >> 10, 0);
	}
#endif
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime;

	if (first_fair(cfs_rq)) {
		vruntime = min_vruntime(cfs_rq->min_vruntime,
				__pick_next_entity(cfs_rq)->vruntime);
	} else
		vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice_add(cfs_rq, se);

	if (!initial) {
		/* sleeps up to a single latency don't count. */
		if (sched_feat(NEW_FAIR_SLEEPERS)) {
			unsigned long thresh = sysctl_sched_latency;

			/*
			 * convert the sleeper threshold into virtual time
			 */
			if (sched_feat(NORMALIZED_SLEEPER))
				thresh = calc_delta_fair(thresh, se);

			vruntime -= thresh;
		}

		/* ensure we never gain time by being placed backwards. */
		vruntime = max_vruntime(se->vruntime, vruntime);
	}

	se->vruntime = vruntime;
}

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	account_entity_enqueue(cfs_rq, se);

	if (wakeup) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
}
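/*
 * Illustrative example (editor's sketch, not part of the original file):
 * the arithmetic behind place_entity() for a waking sleeper. Assume
 * sysctl_sched_latency = 20ms and cfs_rq->min_vruntime = 100ms. With
 * NEW_FAIR_SLEEPERS the candidate position is 100ms - 20ms = 80ms (the
 * 20ms threshold is first scaled to virtual time by calc_delta_fair()
 * when NORMALIZED_SLEEPER is set). The final max_vruntime() clamp means
 * that if the task's old vruntime was 95ms it stays at 95ms: a task can
 * never gain time by being re-placed backwards, and a long sleeper earns
 * at most one latency period of credit.
 */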
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	update_stats_dequeue(cfs_rq, se);
	if (sleep) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	account_entity_dequeue(cfs_rq, se);
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);
}

static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it gets to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. don't track it
	 * when there are only lesser-weight tasks around):
	 */
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
		se->slice_max = max(se->slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

static struct sched_entity *
pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rq *rq = rq_of(cfs_rq);
	u64 pair_slice = rq->clock - cfs_rq->pair_start;

	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
		cfs_rq->pair_start = rq->clock;
		return se;
	}

	return cfs_rq->next;
}

static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = NULL;

	if (first_fair(cfs_rq)) {
		se = __pick_next_entity(cfs_rq);
		se = pick_next(cfs_rq, se);
		set_next_entity(cfs_rq, se);
	}

	return se;
}

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	check_spread(cfs_rq, prev);
	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
	}
	cfs_rq->curr = NULL;
}
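/*
 * Illustrative example (editor's sketch, not part of the original file):
 * the numbers behind check_preempt_tick(). sched_slice() divides the
 * latency period among runnable entities in proportion to their weight,
 * so with three equal-weight tasks and sysctl_sched_latency = 20ms each
 * ideal_runtime is roughly 20ms/3 = 6.67ms. Once
 * sum_exec_runtime - prev_sum_exec_runtime exceeds that slice, the
 * periodic tick marks the current task for rescheduling.
 */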
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
		check_preempt_tick(cfs_rq, curr);
}

/**************************************************
 * CFS operations on tasks:
 */

#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	WARN_ON(task_rq(p) != rq);

	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
		u64 slice = sched_slice(cfs_rq, se);
		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
		s64 delta = slice - ran;

		if (delta < 0) {
			if (rq->curr == p)
				resched_task(p);
			return;
		}

		/*
		 * Don't schedule slices shorter than 10000ns, that just
		 * doesn't make sense. Rely on vruntime for fairness.
		 */
		if (rq->curr != p)
			delta = max_t(s64, 10000LL, delta);

		hrtick_start(rq, delta);
	}
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
#endif

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, wakeup);
		wakeup = 1;
	}

	hrtick_start_fair(rq, rq->curr);
}

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, sleep);
		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight)
			break;
		sleep = 1;
	}

	hrtick_start_fair(rq, rq->curr);
}

/*
 * sched_yield() support is very simple - we dequeue and enqueue.
 *
 * If compat_yield is turned on then we requeue to the end of the tree.
 */
static void yield_task_fair(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	struct sched_entity *rightmost, *se = &curr->se;

	/*
	 * Are we the only task in the tree?
	 */
	if (unlikely(cfs_rq->nr_running == 1))
		return;

	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
		update_rq_clock(rq);
		/*
		 * Update run-time statistics of the 'current'.
		 */
		update_curr(cfs_rq);

		return;
	}
	/*
	 * Find the rightmost entry in the rbtree:
	 */
	rightmost = __pick_last_entity(cfs_rq);
	/*
	 * Already in the rightmost position?
	 */
	if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
		return;

	/*
	 * Minimally necessary key value to be last in the tree:
	 * Upon rescheduling, sched_class::put_prev_task() will place
	 * 'current' within the tree based on its new key value.
	 */
	se->vruntime = rightmost->vruntime + 1;
}
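/*
 * Illustrative example (editor's sketch, not part of the original file):
 * with sysctl_sched_compat_yield enabled, a task calling sched_yield()
 * gets its vruntime set just past the rightmost entity's, so it re-enters
 * the rbtree as the last candidate and every other runnable task runs
 * before it. With the default (compat_yield off), yield merely updates
 * the clock and runtime statistics, and the yielding task may well be
 * picked again almost immediately.
 */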
/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available. The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 * Domains may include CPUs that are not usable for migration,
 * hence we need to mask them out (cpu_active_map)
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	cpumask_t tmp;
	struct sched_domain *sd;
	int i;

	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must also be busy (in most cases) as they didn't already
	 * pick up the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This will avoid the checks and cache miss
	 * penalties associated with that.
	 */
	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
		return cpu;

	for_each_domain(cpu, sd) {
		if ((sd->flags & SD_WAKE_IDLE)
		    || ((sd->flags & SD_WAKE_IDLE_FAR)
			&& !task_hot(p, task_rq(p)->clock, sd))) {
			cpus_and(tmp, sd->span, p->cpus_allowed);
			cpus_and(tmp, tmp, cpu_active_map);
			for_each_cpu_mask_nr(i, tmp) {
				if (idle_cpu(i)) {
					if (i != task_cpu(p)) {
						schedstat_inc(p,
						       se.nr_wakeups_idle);
					}
					return i;
				}
			}
		} else {
			break;
		}
	}
	return cpu;
}
#else /* !ARCH_HAS_SCHED_WAKE_IDLE */
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif

#ifdef CONFIG_SMP

static const struct sched_class fair_sched_class;

#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * effective_load() calculates the load change as seen from the root_task_group
 *
 * Adding load to a group doesn't make a group heavier, but can cause movement
 * of group shares between cpus. Assuming the shares were perfectly aligned one
 * can calculate the shift in shares.
 *
 * The problem is that perfectly aligning the shares is rather expensive, hence
 * we try to avoid doing that too often - see update_shares(), which ratelimits
 * this change.
 *
 * We compensate this by not only taking the current delta into account, but
 * also considering the delta between when the shares were last adjusted and
 * now.
 *
 * We still saw a performance dip, and some tracing taught us that between
 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
 * significantly. Therefore try to bias the error in direction of failing
 * the affine wakeup.
 *
 */
static long effective_load(struct task_group *tg, int cpu,
		long wl, long wg)
{
	struct sched_entity *se = tg->se[cpu];
	long more_w;

	if (!tg->parent)
		return wl;

	/*
	 * By not taking the decrease of shares on the other cpu into
	 * account our error leans towards reducing the affine wakeups.
	 */
	if (!wl && sched_feat(ASYM_EFF_LOAD))
		return wl;

	/*
	 * Instead of using this increment, also add the difference
	 * between when the shares were last updated and now.
	 */
	more_w = se->my_q->load.weight - se->my_q->rq_weight;
	wl += more_w;
	wg += more_w;

	for_each_sched_entity(se) {
#define D(n) (likely(n) ? (n) : 1)

		long S, rw, s, a, b;

		S = se->my_q->tg->shares;
		s = se->my_q->shares;
		rw = se->my_q->rq_weight;

		a = S*(rw + wl);
		b = S*rw + s*wg;

		wl = s*(a-b)/D(b);
		/*
		 * Assume the group is already running and will
		 * thus already be accounted for in the weight.
		 *
		 * That is, moving shares between CPUs, does not
		 * alter the group weight.
		 */
		wg = 0;
#undef D
	}

	return wl;
}
#else

static inline unsigned long effective_load(struct task_group *tg, int cpu,
		unsigned long wl, unsigned long wg)
{
	return wl;
}

#endif

static int
wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
	    int idx, unsigned long load, unsigned long this_load,
	    unsigned int imbalance)
{
	struct task_struct *curr = this_rq->curr;
	struct task_group *tg;
	unsigned long tl = this_load;
	unsigned long tl_per_task;
	unsigned long weight;
	int balanced;
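	/*
	 * Illustrative example (editor's sketch, not part of the original
	 * file): one level of the effective_load() loop above with made-up
	 * numbers. Let the group have S = tg->shares = 1024, a queue weight
	 * rw = 2048 on this cpu with s = 512 local shares, and let a task
	 * of weight wl = wg = 1024 be added. Then:
	 *
	 *	a = S*(rw + wl) = 1024 * 3072 = 3145728
	 *	b = S*rw + s*wg = 2097152 + 524288 = 2621440
	 *	wl = s*(a - b)/b = 512 * 524288 / 2621440 = 102
	 *
	 * i.e. adding 1024 units of task weight inside the group shows up
	 * as only ~102 units of weight change at the parent level, because
	 * the group's total weight is bounded by its shares.
	 */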