diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c731d4d..5da13d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1123,7 +1123,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
@@ -1611,8 +1611,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
smp_wmb();
raw_spin_lock_irqsave(&p->pi_lock, flags);
- src_cpu = task_cpu(p);
- cpu = src_cpu;
+ src_cpu = cpu = task_cpu(p);
if (!(p->state & state))
goto out;
@@ -1654,6 +1653,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+
+ /* Refresh src_cpu as it could have changed since we last read it */
+ src_cpu = task_cpu(p);
if (src_cpu != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
@@ -1727,7 +1729,8 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -2201,11 +2204,73 @@ unsigned long this_cpu_load(void)
}
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't lose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
static long calc_load_fold_active(struct rq *this_rq)
{
@@ -2222,6 +2287,9 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -2233,30 +2301,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
#ifdef CONFIG_NO_HZ
/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumulating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
*
* When making the ILB scale, we should try to pull this in as well.
*/
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
-void calc_load_account_idle(struct rq *this_rq)
+static inline int calc_load_read_idx(void)
{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
long delta;
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
}
-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
{
- long delta = 0;
+ struct rq *this_rq = this_rq();
/*
- * Its got a race, we don't care...
+ * If we're still before the sample window, we're done.
*/
- if (atomic_long_read(&calc_load_tasks_idle))
- delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
return delta;
}
@@ -2342,66 +2498,39 @@ static void calc_global_nohz(void)
{
long delta, active, n;
- /*
- * If we crossed a calc_load_update boundary, make sure to fold
- * any pending idle changes, the respective CPUs might have
- * missed the tick driven calc_load_account_active() update
- * due to NO_HZ.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- /*
- * It could be the one fold was all it took, we done!
- */
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- calc_load_update += n * LOAD_FREQ;
-}
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
-}
+ calc_load_update += n * LOAD_FREQ;
+ }
-static inline long calc_load_fold_idle(void)
-{
- return 0;
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
}
+#else /* !CONFIG_NO_HZ */
-static void calc_global_nohz(void)
-{
-}
-#endif
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
+#endif /* CONFIG_NO_HZ */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -2409,11 +2538,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
*/
void calc_global_load(unsigned long ticks)
{
- long active;
+ long active, delta;
if (time_before(jiffies, calc_load_update + 10))
return;
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
active = atomic_long_read(&calc_load_tasks);
active = active > 0 ? active * FIXED_1 : 0;
@@ -2424,12 +2560,7 @@ void calc_global_load(unsigned long ticks)
calc_load_update += LOAD_FREQ;
/*
- * Account one period with whatever state we found before
- * folding in the nohz state and ageing the entire idle period.
- *
- * This avoids loosing a sample when we go idle between
- * calc_load_account_active() (10 ticks ago) and now and thus
- * under-accounting.
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
*/
calc_global_nohz();
}
@@ -2446,7 +2577,6 @@ static void calc_load_account_active(struct rq *this_rq)
return;
delta = calc_load_fold_active(this_rq);
- delta += calc_load_fold_idle();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
@@ -2454,6 +2584,10 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * End of global load-average stuff
+ */
+
+/*
* The exact cpuload at various idx values, calculated at every tick would be
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
*
@@ -2992,6 +3126,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+ u64 temp = (__force u64) rtime;
+
+ temp *= (__force u64) utime;
+
+ if (sizeof(cputime_t) == 4)
+ temp = div_u64(temp, (__force u32) total);
+ else
+ temp = div64_u64(temp, (__force u64) total);
+
+ return (__force cputime_t) temp;
+}
+
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3001,13 +3149,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
*/
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(utime, rtime, total);
+ else
utime = rtime;
/*
@@ -3034,13 +3178,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
total = cputime.utime + cputime.stime;
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) cputime.utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(cputime.utime, rtime, total);
+ else
utime = rtime;
sig->prev_utime = max(sig->prev_utime, utime);
@@ -4413,13 +4553,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
struct task_struct *p;
int retval;
- get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
- put_online_cpus();
return -ESRCH;
}
@@ -4466,7 +4604,6 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
- put_online_cpus();
return retval;
}
@@ -4509,7 +4646,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
unsigned long flags;
int retval;
- get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -4522,12 +4658,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
- put_online_cpus();
return retval;
}
@@ -5512,7 +5647,6 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -6302,11 +6436,8 @@ int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
- unsigned long val;
-
- val = simple_strtoul(str, NULL, 0);
- if (val < sched_domain_level_max)
- default_relax_domain_level = val;
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
return 1;
}
@@ -6511,7 +6642,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
if (!sd)
return child;
- set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
@@ -6519,6 +6649,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
child->parent = sd;
}
sd->child = child;
+ set_domain_attribute(sd, attr);
return sd;
}
@@ -6875,34 +7006,66 @@ int __init sched_create_sysfs_power_savings_entries(struct device *dev)
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
* around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
case CPU_ONLINE:
case CPU_DOWN_FAILED:
cpuset_update_active_cpus();
- return NOTIFY_OK;
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
cpuset_update_active_cpus();
- return NOTIFY_OK;
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
void __init sched_init_smp(void)
@@ -6912,14 +7075,12 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
- put_online_cpus();
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
@@ -7377,6 +7538,7 @@ void sched_destroy_group(struct task_group *tg)
*/
void sched_move_task(struct task_struct *tsk)
{
+ struct task_group *tg;
int on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -7391,6 +7553,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
+ tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ lockdep_is_held(&tsk->sighand->siglock)),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08497b0..62f8598 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5231,11 +5231,15 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (unlikely(task_cpu(p) != this_cpu)) {
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
- }
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
update_curr(cfs_rq);
@@ -5552,7 +5556,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c95..fdf7522 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
- calc_load_account_idle(rq);
return rq->idle;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index be427c5..3d4b1e2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -560,7 +560,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
int i, weight, more = 0;
u64 rt_period;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 34fe64f..9b8ed6b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ DECLARE_PER_CPU(int, sd_llc_id);
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
+ return p->sched_task_group;
}
static inline bool task_notify_on_migrate(struct task_struct *p)
@@ -954,8 +951,6 @@ static inline u64 sched_avg_period(void)
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
-void calc_load_account_idle(struct rq *this_rq);
-
#ifdef CONFIG_SCHED_HRTICK
/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e8..da5eb5b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq)
+ if (stop && stop->on_rq) {
+ stop->se.exec_start = rq->clock_task;
return stop;
+ }
return NULL;
}
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq->clock_task - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq->clock_task;
+ cpuacct_charge(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
static void set_curr_task_stop(struct rq *rq)
{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq->clock_task;
}
static void switched_to_stop(struct rq *rq, struct task_struct *p) |
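
Side note, not part of the patch: a minimal userspace sketch of the fixed-point exponential-decay update that the "Global load-average calculations" comment above describes, i.e. a1 = a0 * e + a * (1 - e). The FSHIFT/FIXED_1/EXP_* constants mirror the values the kernel uses for avenrun[]; the main() driver, the assumed three runnable tasks, and the 12-window loop are purely illustrative. Fixed-point arithmetic is used because the kernel cannot rely on floating point here.

#include <stdio.h>

#define FSHIFT   11                /* bits of precision */
#define FIXED_1  (1 << FSHIFT)     /* 1.0 in fixed point */
#define EXP_1    1884              /* FIXED_1 / exp(5s/1min) */
#define EXP_5    2014              /* FIXED_1 / exp(5s/5min) */
#define EXP_15   2037              /* FIXED_1 / exp(5s/15min) */

/* a1 = a0 * e + a * (1 - e), all values scaled by FIXED_1 */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long active = 3 * FIXED_1;	/* pretend nr_active == 3 */
	int i;

	/* one update per 5-second window; 12 windows == one minute */
	for (i = 0; i < 12; i++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}

	/* print integer and two fractional digits, /proc/loadavg style */
	for (i = 0; i < 3; i++)
		printf("%lu.%02lu ", avenrun[i] >> FSHIFT,
		       ((avenrun[i] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	printf("\n");
	return 0;
}

After one simulated minute the 1-minute figure approaches 3 * (1 - 1/e), roughly 1.9, while the 5- and 15-minute figures lag behind, which is the decay behaviour the patch's calc_load()/calc_load_n() catch-up logic preserves across NO_HZ idle periods.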