Skip to content

Instantly share code, notes, and snippets.

@invisiblek
Created April 6, 2015 17:29
Show Gist options
  • Save invisiblek/2dc541de01c5ecaf668a to your computer and use it in GitHub Desktop.
Save invisiblek/2dc541de01c5ecaf668a to your computer and use it in GitHub Desktop.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c731d4d..5da13d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1123,7 +1123,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
@@ -1611,8 +1611,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
smp_wmb();
raw_spin_lock_irqsave(&p->pi_lock, flags);
- src_cpu = task_cpu(p);
- cpu = src_cpu;
+ src_cpu = cpu = task_cpu(p);
if (!(p->state & state))
goto out;
@@ -1654,6 +1653,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+
+ /* Refresh src_cpu as it could have changed since we last read it */
+ src_cpu = task_cpu(p);
if (src_cpu != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
@@ -1727,7 +1729,8 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -2201,11 +2204,73 @@ unsigned long this_cpu_load(void)
}
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
static long calc_load_fold_active(struct rq *this_rq)
{
@@ -2222,6 +2287,9 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -2233,30 +2301,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
#ifdef CONFIG_NO_HZ
/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumlating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
*
* When making the ILB scale, we should try to pull this in as well.
*/
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
-void calc_load_account_idle(struct rq *this_rq)
+static inline int calc_load_read_idx(void)
{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
long delta;
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
}
-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
{
- long delta = 0;
+ struct rq *this_rq = this_rq();
/*
- * Its got a race, we don't care...
+ * If we're still before the sample window, we're done.
*/
- if (atomic_long_read(&calc_load_tasks_idle))
- delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
return delta;
}
@@ -2342,66 +2498,39 @@ static void calc_global_nohz(void)
{
long delta, active, n;
- /*
- * If we crossed a calc_load_update boundary, make sure to fold
- * any pending idle changes, the respective CPUs might have
- * missed the tick driven calc_load_account_active() update
- * due to NO_HZ.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- /*
- * It could be the one fold was all it took, we done!
- */
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- calc_load_update += n * LOAD_FREQ;
-}
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
-}
+ calc_load_update += n * LOAD_FREQ;
+ }
-static inline long calc_load_fold_idle(void)
-{
- return 0;
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
}
+#else /* !CONFIG_NO_HZ */
-static void calc_global_nohz(void)
-{
-}
-#endif
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
+#endif /* CONFIG_NO_HZ */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -2409,11 +2538,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
*/
void calc_global_load(unsigned long ticks)
{
- long active;
+ long active, delta;
if (time_before(jiffies, calc_load_update + 10))
return;
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
active = atomic_long_read(&calc_load_tasks);
active = active > 0 ? active * FIXED_1 : 0;
@@ -2424,12 +2560,7 @@ void calc_global_load(unsigned long ticks)
calc_load_update += LOAD_FREQ;
/*
- * Account one period with whatever state we found before
- * folding in the nohz state and ageing the entire idle period.
- *
- * This avoids loosing a sample when we go idle between
- * calc_load_account_active() (10 ticks ago) and now and thus
- * under-accounting.
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
*/
calc_global_nohz();
}
@@ -2446,7 +2577,6 @@ static void calc_load_account_active(struct rq *this_rq)
return;
delta = calc_load_fold_active(this_rq);
- delta += calc_load_fold_idle();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
@@ -2454,6 +2584,10 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * End of global load-average stuff
+ */
+
+/*
* The exact cpuload at various idx values, calculated at every tick would be
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
*
@@ -2992,6 +3126,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+ u64 temp = (__force u64) rtime;
+
+ temp *= (__force u64) utime;
+
+ if (sizeof(cputime_t) == 4)
+ temp = div_u64(temp, (__force u32) total);
+ else
+ temp = div64_u64(temp, (__force u64) total);
+
+ return (__force cputime_t) temp;
+}
+
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3001,13 +3149,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
*/
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(utime, rtime, total);
+ else
utime = rtime;
/*
@@ -3034,13 +3178,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
total = cputime.utime + cputime.stime;
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) cputime.utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(cputime.utime, rtime, total);
+ else
utime = rtime;
sig->prev_utime = max(sig->prev_utime, utime);
@@ -4413,13 +4553,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
struct task_struct *p;
int retval;
- get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
- put_online_cpus();
return -ESRCH;
}
@@ -4466,7 +4604,6 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
- put_online_cpus();
return retval;
}
@@ -4509,7 +4646,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
unsigned long flags;
int retval;
- get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -4522,12 +4658,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
- put_online_cpus();
return retval;
}
@@ -5512,7 +5647,6 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -6302,11 +6436,8 @@ int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
- unsigned long val;
-
- val = simple_strtoul(str, NULL, 0);
- if (val < sched_domain_level_max)
- default_relax_domain_level = val;
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
return 1;
}
@@ -6511,7 +6642,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
if (!sd)
return child;
- set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
@@ -6519,6 +6649,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
child->parent = sd;
}
sd->child = child;
+ set_domain_attribute(sd, attr);
return sd;
}
@@ -6875,34 +7006,66 @@ int __init sched_create_sysfs_power_savings_entries(struct device *dev)
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
* around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
case CPU_ONLINE:
case CPU_DOWN_FAILED:
cpuset_update_active_cpus();
- return NOTIFY_OK;
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
cpuset_update_active_cpus();
- return NOTIFY_OK;
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
void __init sched_init_smp(void)
@@ -6912,14 +7075,12 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
- put_online_cpus();
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
@@ -7377,6 +7538,7 @@ void sched_destroy_group(struct task_group *tg)
*/
void sched_move_task(struct task_struct *tsk)
{
+ struct task_group *tg;
int on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -7391,6 +7553,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
+ tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ lockdep_is_held(&tsk->sighand->siglock)),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08497b0..62f8598 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5231,11 +5231,15 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (unlikely(task_cpu(p) != this_cpu)) {
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
- }
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
update_curr(cfs_rq);
@@ -5552,7 +5556,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c95..fdf7522 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
- calc_load_account_idle(rq);
return rq->idle;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index be427c5..3d4b1e2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -560,7 +560,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
int i, weight, more = 0;
u64 rt_period;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 34fe64f..9b8ed6b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ DECLARE_PER_CPU(int, sd_llc_id);
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
+ return p->sched_task_group;
}
static inline bool task_notify_on_migrate(struct task_struct *p)
@@ -954,8 +951,6 @@ static inline u64 sched_avg_period(void)
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
-void calc_load_account_idle(struct rq *this_rq);
-
#ifdef CONFIG_SCHED_HRTICK
/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e8..da5eb5b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq)
+ if (stop && stop->on_rq) {
+ stop->se.exec_start = rq->clock_task;
return stop;
+ }
return NULL;
}
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq->clock_task - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq->clock_task;
+ cpuacct_charge(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
static void set_curr_task_stop(struct rq *rq)
{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq->clock_task;
}
static void switched_to_stop(struct rq *rq, struct task_struct *p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment