diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c731d4d..5da13d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1123,7 +1123,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
@@ -1611,8 +1611,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
smp_wmb();
raw_spin_lock_irqsave(&p->pi_lock, flags);
- src_cpu = task_cpu(p);
- cpu = src_cpu;
+ src_cpu = cpu = task_cpu(p);
if (!(p->state & state))
goto out;
@@ -1654,6 +1653,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+
+ /* Refresh src_cpu as it could have changed since we last read it */
+ src_cpu = task_cpu(p);
if (src_cpu != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
@@ -1727,7 +1729,8 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -2201,11 +2204,73 @@ unsigned long this_cpu_load(void)
}
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ * nr_active = 0;
+ * for_each_possible_cpu(cpu)
+ * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ * - for_each_possible_cpu() is prohibitively expensive on machines with
+ * serious number of cpus, therefore we need to take a distributed approach
+ * to calculating nr_active.
+ *
+ * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ * So assuming nr_active := 0 when we start out -- true per definition, we
+ * can simply take per-cpu deltas and fold those into a global accumulate
+ * to obtain the same result. See calc_load_fold_active().
+ *
+ * Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ * across the machine, we assume 10 ticks is sufficient time for every
+ * cpu to have completed this task.
+ *
+ * This places an upper-bound on the IRQ-off latency of the machine. Then
+ * again, being late doesn't lose the delta, just wrecks the sample.
+ *
+ * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ * this would add another cross-cpu cacheline miss and atomic operation
+ * to the wakeup path. Instead we increment on whatever cpu the task ran
+ * when it went into uninterruptible state and decrement on whatever cpu
+ * did the wakeup. This means that only the sum of nr_uninterruptible over
+ * all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (avenrun[0] + offset) << shift;
+ loads[1] = (avenrun[1] + offset) << shift;
+ loads[2] = (avenrun[2] + offset) << shift;
+}
static long calc_load_fold_active(struct rq *this_rq)
{
@@ -2222,6 +2287,9 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -2233,30 +2301,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
#ifdef CONFIG_NO_HZ
/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ * - When we go NO_HZ idle during the window, we can negate our sample
+ * contribution, causing under-accounting.
+ *
+ * We avoid this by keeping two idle-delta counters and flipping them
+ * when the window starts, thus separating old and new NO_HZ load.
+ *
+ * The only trick is the slight shift in index flip for read vs write.
+ *
+ * 0s 5s 10s 15s
+ * +10 +10 +10 +10
+ * |-|-----------|-|-----------|-|-----------|-|
+ * r:0 0 1 1 0 0 1 1 0
+ * w:0 1 1 0 0 1 1 0 0
+ *
+ * This ensures we'll fold the old idle contribution in this window while
+ * accumulating the new one.
+ *
+ * - When we wake up from NO_HZ idle during the window, we push up our
+ * contribution, since we effectively move our sample point to a known
+ * busy state.
+ *
+ * This is solved by pushing the window forward, and thus skipping the
+ * sample, for this cpu (effectively using the idle-delta for this cpu which
+ * was in effect at the time the window opened). This also solves the issue
+ * of having to deal with a cpu having been in NOHZ idle for multiple
+ * LOAD_FREQ intervals.
*
* When making the ILB scale, we should try to pull this in as well.
*/
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+ int idx = calc_load_idx;
+
+ /*
+ * See calc_global_nohz(), if we observe the new index, we also
+ * need to observe the new update time.
+ */
+ smp_rmb();
+
+ /*
+ * If the folding window started, make sure we start writing in the
+ * next idle-delta.
+ */
+ if (!time_before(jiffies, calc_load_update))
+ idx++;
+
+ return idx & 1;
+}
-void calc_load_account_idle(struct rq *this_rq)
+static inline int calc_load_read_idx(void)
{
+ return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+ struct rq *this_rq = this_rq();
long delta;
+ /*
+ * We're going into NOHZ mode, if there's any pending delta, fold it
+ * into the pending idle delta.
+ */
delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
+ if (delta) {
+ int idx = calc_load_write_idx();
+ atomic_long_add(delta, &calc_load_idle[idx]);
+ }
}
-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
{
- long delta = 0;
+ struct rq *this_rq = this_rq();
/*
- * Its got a race, we don't care...
+ * If we're still before the sample window, we're done.
*/
- if (atomic_long_read(&calc_load_tasks_idle))
- delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
+
+ /*
+ * We woke inside or after the sample window, this means we're already
+ * accounted through the nohz accounting, so skip the entire deal and
+ * sync up for the next window.
+ */
+ this_rq->calc_load_update = calc_load_update;
+ if (time_before(jiffies, this_rq->calc_load_update + 10))
+ this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+ int idx = calc_load_read_idx();
+ long delta = 0;
+
+ if (atomic_long_read(&calc_load_idle[idx]))
+ delta = atomic_long_xchg(&calc_load_idle[idx], 0);
return delta;
}
@@ -2342,66 +2498,39 @@ static void calc_global_nohz(void)
{
long delta, active, n;
- /*
- * If we crossed a calc_load_update boundary, make sure to fold
- * any pending idle changes, the respective CPUs might have
- * missed the tick driven calc_load_account_active() update
- * due to NO_HZ.
- */
- delta = calc_load_fold_idle();
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-
- /*
- * It could be the one fold was all it took, we done!
- */
- if (time_before(jiffies, calc_load_update + 10))
- return;
-
- /*
- * Catch-up, fold however many we are behind still
- */
- delta = jiffies - calc_load_update - 10;
- n = 1 + (delta / LOAD_FREQ);
+ if (!time_before(jiffies, calc_load_update + 10)) {
+ /*
+ * Catch-up, fold however many we are behind still
+ */
+ delta = jiffies - calc_load_update - 10;
+ n = 1 + (delta / LOAD_FREQ);
- active = atomic_long_read(&calc_load_tasks);
- active = active > 0 ? active * FIXED_1 : 0;
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
- avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- calc_load_update += n * LOAD_FREQ;
-}
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
-}
+ calc_load_update += n * LOAD_FREQ;
+ }
-static inline long calc_load_fold_idle(void)
-{
- return 0;
+ /*
+ * Flip the idle index...
+ *
+ * Make sure we first write the new time then flip the index, so that
+ * calc_load_write_idx() will see the new time when it reads the new
+ * index, this avoids a double flip messing things up.
+ */
+ smp_wmb();
+ calc_load_idx++;
}
+#else /* !CONFIG_NO_HZ */
-static void calc_global_nohz(void)
-{
-}
-#endif
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
-/**
- * get_avenrun - get the load average array
- * @loads: pointer to dest load array
- * @offset: offset to add
- * @shift: shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
- loads[0] = (avenrun[0] + offset) << shift;
- loads[1] = (avenrun[1] + offset) << shift;
- loads[2] = (avenrun[2] + offset) << shift;
-}
+#endif /* CONFIG_NO_HZ */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -2409,11 +2538,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
*/
void calc_global_load(unsigned long ticks)
{
- long active;
+ long active, delta;
if (time_before(jiffies, calc_load_update + 10))
return;
+ /*
+ * Fold the 'old' idle-delta to include all NO_HZ cpus.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
active = atomic_long_read(&calc_load_tasks);
active = active > 0 ? active * FIXED_1 : 0;
@@ -2424,12 +2560,7 @@ void calc_global_load(unsigned long ticks)
calc_load_update += LOAD_FREQ;
/*
- * Account one period with whatever state we found before
- * folding in the nohz state and ageing the entire idle period.
- *
- * This avoids loosing a sample when we go idle between
- * calc_load_account_active() (10 ticks ago) and now and thus
- * under-accounting.
+ * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
*/
calc_global_nohz();
}
@@ -2446,7 +2577,6 @@ static void calc_load_account_active(struct rq *this_rq)
return;
delta = calc_load_fold_active(this_rq);
- delta += calc_load_fold_idle();
if (delta)
atomic_long_add(delta, &calc_load_tasks);
@@ -2454,6 +2584,10 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * End of global load-average stuff
+ */
+
+/*
* The exact cpuload at various idx values, calculated at every tick would be
* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
*
@@ -2992,6 +3126,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+ u64 temp = (__force u64) rtime;
+
+ temp *= (__force u64) utime;
+
+ if (sizeof(cputime_t) == 4)
+ temp = div_u64(temp, (__force u32) total);
+ else
+ temp = div64_u64(temp, (__force u64) total);
+
+ return (__force cputime_t) temp;
+}
+
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3001,13 +3149,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
*/
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(utime, rtime, total);
+ else
utime = rtime;
/*
@@ -3034,13 +3178,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
total = cputime.utime + cputime.stime;
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) cputime.utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
+ if (total)
+ utime = scale_utime(cputime.utime, rtime, total);
+ else
utime = rtime;
sig->prev_utime = max(sig->prev_utime, utime);
@@ -4413,13 +4553,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
struct task_struct *p;
int retval;
- get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
- put_online_cpus();
return -ESRCH;
}
@@ -4466,7 +4604,6 @@ out_free_cpus_allowed:
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
- put_online_cpus();
return retval;
}
@@ -4509,7 +4646,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
unsigned long flags;
int retval;
- get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -4522,12 +4658,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
- put_online_cpus();
return retval;
}
@@ -5512,7 +5647,6 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -6302,11 +6436,8 @@ int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
- unsigned long val;
-
- val = simple_strtoul(str, NULL, 0);
- if (val < sched_domain_level_max)
- default_relax_domain_level = val;
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
return 1;
}
@@ -6511,7 +6642,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
if (!sd)
return child;
- set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
@@ -6519,6 +6649,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
child->parent = sd;
}
sd->child = child;
+ set_domain_attribute(sd, attr);
return sd;
}
@@ -6875,34 +7006,66 @@ int __init sched_create_sysfs_power_savings_entries(struct device *dev)
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
* around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
case CPU_ONLINE:
case CPU_DOWN_FAILED:
cpuset_update_active_cpus();
- return NOTIFY_OK;
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
cpuset_update_active_cpus();
- return NOTIFY_OK;
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
void __init sched_init_smp(void)
@@ -6912,14 +7075,12 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
- put_online_cpus();
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
@@ -7377,6 +7538,7 @@ void sched_destroy_group(struct task_group *tg)
*/
void sched_move_task(struct task_struct *tsk)
{
+ struct task_group *tg;
int on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -7391,6 +7553,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
+ tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ lockdep_is_held(&tsk->sighand->siglock)),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 08497b0..62f8598 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5231,11 +5231,15 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (unlikely(task_cpu(p) != this_cpu)) {
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
- }
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
update_curr(cfs_rq);
@@ -5552,7 +5556,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c95..fdf7522 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
- calc_load_account_idle(rq);
return rq->idle;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index be427c5..3d4b1e2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -560,7 +560,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
int i, weight, more = 0;
u64 rt_period;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 34fe64f..9b8ed6b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ DECLARE_PER_CPU(int, sd_llc_id);
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
+ return p->sched_task_group;
}
static inline bool task_notify_on_migrate(struct task_struct *p)
@@ -954,8 +951,6 @@ static inline u64 sched_avg_period(void)
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
-void calc_load_account_idle(struct rq *this_rq);
-
#ifdef CONFIG_SCHED_HRTICK
/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e8..da5eb5b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq)
+ if (stop && stop->on_rq) {
+ stop->se.exec_start = rq->clock_task;
return stop;
+ }
return NULL;
}
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq->clock_task - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq->clock_task;
+ cpuacct_charge(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
static void set_curr_task_stop(struct rq *rq)
{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq->clock_task;
}
static void switched_to_stop(struct rq *rq, struct task_struct *p) |
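
Side note, not part of the patch: a minimal userspace sketch of the fixed-point exponential-decay update that the "Global load-average calculations" comment above describes, i.e. a1 = a0 * e + a * (1 - e). The FSHIFT/FIXED_1/EXP_* constants mirror the values the kernel uses for avenrun[]; the main() driver, the assumed three runnable tasks, and the 12-window loop are purely illustrative. Fixed-point arithmetic is used because the kernel cannot rely on floating point here.

#include <stdio.h>

#define FSHIFT   11                /* bits of precision */
#define FIXED_1  (1 << FSHIFT)     /* 1.0 in fixed point */
#define EXP_1    1884              /* FIXED_1 / exp(5s/1min) */
#define EXP_5    2014              /* FIXED_1 / exp(5s/5min) */
#define EXP_15   2037              /* FIXED_1 / exp(5s/15min) */

/* a1 = a0 * e + a * (1 - e), all values scaled by FIXED_1 */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long active = 3 * FIXED_1;	/* pretend nr_active == 3 */
	int i;

	/* one update per 5-second window; 12 windows == one minute */
	for (i = 0; i < 12; i++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}

	/* print integer and two fractional digits, /proc/loadavg style */
	for (i = 0; i < 3; i++)
		printf("%lu.%02lu ", avenrun[i] >> FSHIFT,
		       ((avenrun[i] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	printf("\n");
	return 0;
}

After one simulated minute the 1-minute figure approaches 3 * (1 - 1/e), roughly 1.9, while the 5- and 15-minute figures lag behind, which is the decay behaviour the patch's calc_load()/calc_load_n() catch-up logic preserves across NO_HZ idle periods.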