diff options
| author | Meizu OpenSource <patchwork@meizu.com> | 2016-08-15 10:19:42 +0800 |
|---|---|---|
| committer | Meizu OpenSource <patchwork@meizu.com> | 2016-08-15 10:19:42 +0800 |
| commit | d2e1446d81725c351dc73a03b397ce043fb18452 (patch) | |
| tree | 4dbc616b7f92aea39cd697a9084205ddb805e344 /kernel/sched | |
first commit
Diffstat (limited to 'kernel/sched')
| -rw-r--r-- | kernel/sched/Makefile | 19 | ||||
| -rw-r--r-- | kernel/sched/auto_group.c | 260 | ||||
| -rw-r--r-- | kernel/sched/auto_group.h | 64 | ||||
| -rw-r--r-- | kernel/sched/clock.c | 376 | ||||
| -rw-r--r-- | kernel/sched/core.c | 8605 | ||||
| -rw-r--r-- | kernel/sched/cpuacct.c | 296 | ||||
| -rw-r--r-- | kernel/sched/cpuacct.h | 17 | ||||
| -rw-r--r-- | kernel/sched/cpupri.c | 241 | ||||
| -rw-r--r-- | kernel/sched/cpupri.h | 34 | ||||
| -rw-r--r-- | kernel/sched/cputime.c | 854 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 1288 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 10321 | ||||
| -rw-r--r-- | kernel/sched/features.h | 72 | ||||
| -rw-r--r-- | kernel/sched/idle_task.c | 115 | ||||
| -rw-r--r-- | kernel/sched/rt.c | 2993 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 1427 | ||||
| -rw-r--r-- | kernel/sched/stats.c | 145 | ||||
| -rw-r--r-- | kernel/sched/stats.h | 231 | ||||
| -rw-r--r-- | kernel/sched/stop_task.c | 143 |
19 files changed, 27501 insertions, 0 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile new file mode 100644 index 000000000..deaf90e4a --- /dev/null +++ b/kernel/sched/Makefile @@ -0,0 +1,19 @@ +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_clock.o = -pg +endif + +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) +# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is +# needed for x86 only. Why this used to be enabled for all architectures is beyond +# me. I suspect most platforms don't need this, but until we know that for sure +# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k +# to get a correct value for the wait-channel (WCHAN in ps). --davidm +CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer +endif + +obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-$(CONFIG_SMP) += cpupri.o +obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o +obj-$(CONFIG_SCHEDSTATS) += stats.o +obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c new file mode 100644 index 000000000..4a073539c --- /dev/null +++ b/kernel/sched/auto_group.c @@ -0,0 +1,260 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include "sched.h" + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/utsname.h> +#include <linux/security.h> +#include <linux/export.h> + +unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; +static struct autogroup autogroup_default; +static atomic_t autogroup_seq_nr; + +void __init autogroup_init(struct task_struct *init_task) +{ + autogroup_default.tg = &root_task_group; + kref_init(&autogroup_default.kref); + init_rwsem(&autogroup_default.lock); + init_task->signal->autogroup = &autogroup_default; +} + +void autogroup_free(struct task_group *tg) +{ + kfree(tg->autogroup); +} + +static inline void autogroup_destroy(struct kref *kref) +{ + struct autogroup *ag = container_of(kref, struct autogroup, kref); + +#ifdef CONFIG_RT_GROUP_SCHED + /* We've redirected RT tasks to the root task group... */ + ag->tg->rt_se = NULL; + ag->tg->rt_rq = NULL; +#endif + sched_offline_group(ag->tg); + sched_destroy_group(ag->tg); +} + +static inline void autogroup_kref_put(struct autogroup *ag) +{ + kref_put(&ag->kref, autogroup_destroy); +} + +static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) +{ + kref_get(&ag->kref); + return ag; +} + +static inline struct autogroup *autogroup_task_get(struct task_struct *p) +{ + struct autogroup *ag; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return autogroup_kref_get(&autogroup_default); + + ag = autogroup_kref_get(p->signal->autogroup); + unlock_task_sighand(p, &flags); + + return ag; +} + +static inline struct autogroup *autogroup_create(void) +{ + struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); + struct task_group *tg; + + if (!ag) + goto out_fail; + + tg = sched_create_group(&root_task_group); + + if (IS_ERR(tg)) + goto out_free; + + kref_init(&ag->kref); + init_rwsem(&ag->lock); + ag->id = atomic_inc_return(&autogroup_seq_nr); + ag->tg = tg; +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Autogroup RT tasks are redirected to the root task group + * so we don't have to move tasks around upon policy change, + * or flail around trying to allocate bandwidth on the fly. + * A bandwidth exception in __sched_setscheduler() allows + * the policy change to proceed. Thereafter, task_group() + * returns &root_task_group, so zero bandwidth is required. + */ + free_rt_sched_group(tg); + tg->rt_se = root_task_group.rt_se; + tg->rt_rq = root_task_group.rt_rq; +#endif + tg->autogroup = ag; + + sched_online_group(tg, &root_task_group); + return ag; + +out_free: + kfree(ag); +out_fail: + if (printk_ratelimit()) { + printk(KERN_WARNING "autogroup_create: %s failure.\n", + ag ? "sched_create_group()" : "kmalloc()"); + } + + return autogroup_kref_get(&autogroup_default); +} + +bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) +{ + if (tg != &root_task_group) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + /* + * We can only assume the task group can't go away on us if + * autogroup_move_group() can see us on ->thread_group list. + */ + if (p->flags & PF_EXITING) + return false; + + return true; +} + +static void +autogroup_move_group(struct task_struct *p, struct autogroup *ag) +{ + struct autogroup *prev; + struct task_struct *t; + unsigned long flags; + + BUG_ON(!lock_task_sighand(p, &flags)); + + prev = p->signal->autogroup; + if (prev == ag) { + unlock_task_sighand(p, &flags); + return; + } + + p->signal->autogroup = autogroup_kref_get(ag); + + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + + t = p; + do { + sched_move_task(t); + } while_each_thread(p, t); + +out: + unlock_task_sighand(p, &flags); + autogroup_kref_put(prev); +} + +/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +void sched_autogroup_create_attach(struct task_struct *p) +{ + struct autogroup *ag = autogroup_create(); + + autogroup_move_group(p, ag); + /* drop extra reference added by autogroup_create() */ + autogroup_kref_put(ag); +} +EXPORT_SYMBOL(sched_autogroup_create_attach); + +/* Cannot be called under siglock. Currently has no users */ +void sched_autogroup_detach(struct task_struct *p) +{ + autogroup_move_group(p, &autogroup_default); +} +EXPORT_SYMBOL(sched_autogroup_detach); + +void sched_autogroup_fork(struct signal_struct *sig) +{ + sig->autogroup = autogroup_task_get(current); +} + +void sched_autogroup_exit(struct signal_struct *sig) +{ + autogroup_kref_put(sig->autogroup); +} + +static int __init setup_autogroup(char *str) +{ + sysctl_sched_autogroup_enabled = 0; + + return 1; +} + +__setup("noautogroup", setup_autogroup); + +#ifdef CONFIG_PROC_FS + +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) +{ + static unsigned long next = INITIAL_JIFFIES; + struct autogroup *ag; + int err; + + if (nice < -20 || nice > 19) + return -EINVAL; + + err = security_task_setnice(current, nice); + if (err) + return err; + + if (nice < 0 && !can_nice(current, nice)) + return -EPERM; + + /* this is a heavy operation taking global locks.. */ + if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) + return -EAGAIN; + + next = HZ / 10 + jiffies; + ag = autogroup_task_get(p); + + down_write(&ag->lock); + err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + if (!err) + ag->nice = nice; + up_write(&ag->lock); + + autogroup_kref_put(ag); + + return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ + struct autogroup *ag = autogroup_task_get(p); + + if (!task_group_is_autogroup(ag->tg)) + goto out; + + down_read(&ag->lock); + seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); + up_read(&ag->lock); + +out: + autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SCHED_DEBUG +int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + if (!task_group_is_autogroup(tg)) + return 0; + + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); +} +#endif /* CONFIG_SCHED_DEBUG */ + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h new file mode 100644 index 000000000..8bd047142 --- /dev/null +++ b/kernel/sched/auto_group.h @@ -0,0 +1,64 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include <linux/kref.h> +#include <linux/rwsem.h> + +struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c new file mode 100644 index 000000000..c3ae14464 --- /dev/null +++ b/kernel/sched/clock.c @@ -0,0 +1,376 @@ +/* + * sched_clock for unstable cpu clocks + * + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Updates and enhancements: + * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com> + * + * Based on code by: + * Ingo Molnar <mingo@redhat.com> + * Guillaume Chazarain <guichaz@gmail.com> + * + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotomic) + * - sched_clock() + * - explicit idle events + * + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keeping it within an + * expected window. + * + * Furthermore, explicit sleep and wakeup hooks allow us to account for time + * that is otherwise invisible (TSC gets stopped). + * + * + * Notes: + * + * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). + */ +#include <linux/spinlock.h> +#include <linux/hardirq.h> +#include <linux/export.h> +#include <linux/percpu.h> +#include <linux/ktime.h> +#include <linux/sched.h> + +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)(jiffies - INITIAL_JIFFIES) + * (NSEC_PER_SEC / HZ); +} +EXPORT_SYMBOL_GPL(sched_clock); + +__read_mostly int sched_clock_running; + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__read_mostly int sched_clock_stable; + +struct sched_clock_data { + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) +{ + return &__get_cpu_var(sched_clock_data); +} + +static inline struct sched_clock_data *cpu_sdc(int cpu) +{ + return &per_cpu(sched_clock_data, cpu); +} + +void sched_clock_init(void) +{ + u64 ktime_now = ktime_to_ns(ktime_get()); + int cpu; + + for_each_possible_cpu(cpu) { + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->tick_raw = 0; + scd->tick_gtod = ktime_now; + scd->clock = ktime_now; + } + + sched_clock_running = 1; +} + +/* + * min, max except they take wrapping into account + */ + +static inline u64 wrap_min(u64 x, u64 y) +{ + return (s64)(x - y) < 0 ? x : y; +} + +static inline u64 wrap_max(u64 x, u64 y) +{ + return (s64)(x - y) > 0 ? x : y; +} + +/* + * update the percpu scd from the raw @now value + * + * - filter out backward motion + * - use the GTOD tick value to create a window to filter crazy TSC values + */ +static u64 sched_clock_local(struct sched_clock_data *scd) +{ + u64 now, clock, old_clock, min_clock, max_clock; + s64 delta; + +again: + now = sched_clock(); + delta = now - scd->tick_raw; + if (unlikely(delta < 0)) + delta = 0; + + old_clock = scd->clock; + + /* + * scd->clock = clamp(scd->tick_gtod + delta, + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); + */ + + clock = scd->tick_gtod + delta; + min_clock = wrap_max(scd->tick_gtod, old_clock); + max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); + + clock = wrap_max(clock, min_clock); + clock = wrap_min(clock, max_clock); + + if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) + goto again; + + return clock; +} + +static u64 sched_clock_remote(struct sched_clock_data *scd) +{ + struct sched_clock_data *my_scd = this_scd(); + u64 this_clock, remote_clock; + u64 *ptr, old_val, val; + +#if BITS_PER_LONG != 64 +again: + /* + * Careful here: The local and the remote clock values need to + * be read out atomic as we need to compare the values and + * then update either the local or the remote side. So the + * cmpxchg64 below only protects one readout. + * + * We must reread via sched_clock_local() in the retry case on + * 32bit as an NMI could use sched_clock_local() via the + * tracer and hit between the readout of + * the low32bit and the high 32bit portion. + */ + this_clock = sched_clock_local(my_scd); + /* + * We must enforce atomic readout on 32bit, otherwise the + * update on the remote cpu can hit inbetween the readout of + * the low32bit and the high 32bit portion. + */ + remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else + /* + * On 64bit the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32bit dance. + */ + sched_clock_local(my_scd); +again: + this_clock = my_scd->clock; + remote_clock = scd->clock; +#endif + + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely((s64)(remote_clock - this_clock) < 0)) { + ptr = &scd->clock; + old_val = remote_clock; + val = this_clock; + } else { + /* + * Should be rare, but possible: + */ + ptr = &my_scd->clock; + old_val = this_clock; + val = remote_clock; + } + + if (cmpxchg64(ptr, old_val, val) != old_val) + goto again; + + return val; +} + +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ +u64 sched_clock_cpu(int cpu) +{ + struct sched_clock_data *scd; + u64 clock; + + WARN_ON_ONCE(!irqs_disabled()); + + if (sched_clock_stable) + return sched_clock(); + + if (unlikely(!sched_clock_running)) + return 0ull; + + scd = cpu_sdc(cpu); + + if (cpu != smp_processor_id()) + clock = sched_clock_remote(scd); + else + clock = sched_clock_local(scd); + + return clock; +} + +void sched_clock_tick(void) +{ + struct sched_clock_data *scd; + u64 now, now_gtod; + + if (sched_clock_stable) + return; + + if (unlikely(!sched_clock_running)) + return; + + WARN_ON_ONCE(!irqs_disabled()); + + scd = this_scd(); + now_gtod = ktime_to_ns(ktime_get()); + now = sched_clock(); + + scd->tick_raw = now; + scd->tick_gtod = now_gtod; + sched_clock_local(scd); +} + +/* + * We are going deep-idle (irqs are disabled): + */ +void sched_clock_idle_sleep_event(void) +{ + sched_clock_cpu(smp_processor_id()); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + if (timekeeping_suspended) + return; + + sched_clock_tick(); + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); + +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +u64 cpu_clock(int cpu) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(cpu); + local_irq_restore(flags); + + return clock; +} + +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestampt taken on the + * same CPU. + * + * See cpu_clock(). + */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + +u64 sched_clock_cpu(int cpu) +{ + if (unlikely(!sched_clock_running)) + return 0; + + return sched_clock(); +} + +u64 cpu_clock(int cpu) +{ + return sched_clock_cpu(cpu); +} + +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/sched/core.c b/kernel/sched/core.c new file mode 100644 index 000000000..896888038 --- /dev/null +++ b/kernel/sched/core.c @@ -0,0 +1,8605 @@ +/* + * kernel/sched/core.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. + * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> +#include <asm/mmu_context.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/completion.h> +#include <linux/kernel_stat.h> +#include <linux/debug_locks.h> +#include <linux/perf_event.h> +#include <linux/security.h> +#include <linux/notifier.h> +#include <linux/profile.h> +#include <linux/freezer.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/pid_namespace.h> +#include <linux/smp.h> +#include <linux/threads.h> +#include <linux/timer.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/percpu.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sysctl.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/tsacct_kern.h> +#include <linux/kprobes.h> +#include <linux/delayacct.h> +#include <linux/unistd.h> +#include <linux/pagemap.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/debugfs.h> +#include <linux/ctype.h> +#include <linux/ftrace.h> +#include <linux/slab.h> +#include <linux/init_task.h> +#include <linux/binfmts.h> +#include <linux/context_tracking.h> + +#include <asm/switch_to.h> +#include <asm/tlb.h> +#include <asm/irq_regs.h> +#include <asm/mutex.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif + +#include "sched.h" +#include "../workqueue_internal.h" +#include "../smpboot.h" + +#ifdef CONFIG_MT65XX_TRACER +#include "mach/mt_mon.h" +#include "linux/aee.h" +#endif + +#include <linux/mt_sched_mon.h> +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> + +#include <mtlbprof/mtlbprof.h> +#include <mtlbprof/mtlbprof_stat.h> + +#ifdef CONFIG_MT_PRIO_TRACER +# include <linux/prio_tracer.h> +#endif + +void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) +{ + unsigned long delta; + ktime_t soft, hard, now; + + for (;;) { + if (hrtimer_active(period_timer)) + break; + + now = hrtimer_cb_get_time(period_timer); + hrtimer_forward(period_timer, now, period); + + soft = hrtimer_get_softexpires(period_timer); + hard = hrtimer_get_expires(period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(period_timer, soft, delta, + HRTIMER_MODE_ABS_PINNED, 0); + } +} + +DEFINE_MUTEX(sched_domains_mutex); +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +static void update_rq_clock_task(struct rq *rq, s64 delta); + +void update_rq_clock(struct rq *rq) +{ + s64 delta; + + if (rq->skip_clock_update > 0) + return; + + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + rq->clock += delta; + update_rq_clock_task(rq, delta); +} + +/* + * Debugging: various feature bits + */ + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + +const_debug unsigned int sysctl_sched_features = +#include "features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +static const char * const sched_feat_names[] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static int sched_feat_set(char *cmp) +{ + int i; + int neg = 0; + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } + break; + } + } + + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + i = sched_feat_set(cmp); + if (i == __SCHED_FEAT_NR) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); +#endif /* CONFIG_SCHED_DEBUG */ + +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + +/* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + +/* + * period over which we measure -rt task cpu usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +__read_mostly int scheduler_running; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + + + +/* + * __task_rq_lock - lock the rq @p resides on. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + raw_spin_unlock(&rq->lock); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } +} + +static void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + +/* + * this_rq_lock - lock this runqueue and disable interrupts. + */ +static struct rq *this_rq_lock(void) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + raw_spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_SMP +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) +{ + struct rq *rq = arg; + + raw_spin_lock(&rq->lock); + hrtimer_restart(&rq->hrtick_timer); + rq->hrtick_csd_pending = 0; + raw_spin_unlock(&rq->lock); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +void hrtick_start(struct rq *rq, u64 delay) +{ + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + + hrtimer_set_expires(timer, time); + + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + rq->hrtick_csd_pending = 1; + } +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hrtick_clear(cpu_rq(cpu)); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static __init void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} +#else +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +void hrtick_start(struct rq *rq, u64 delay) +{ + __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) +{ +#ifdef CONFIG_SMP + rq->hrtick_csd_pending = 0; + + rq->hrtick_csd.flags = 0; + rq->hrtick_csd.func = __hrtick_start; + rq->hrtick_csd.info = rq; +#endif + + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; +} +#else /* CONFIG_SCHED_HRTICK */ +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SCHED_HRTICK */ + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP +void resched_task(struct task_struct *p) +{ + int cpu; + + assert_raw_spin_locked(&task_rq(p)->lock); + + if (test_tsk_need_resched(p)) + return; + + set_tsk_need_resched(p); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} + +void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!raw_spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) { + if (!idle_cpu(i)) { + cpu = i; + goto unlock; + } + } + } +unlock: + rcu_read_unlock(); + return cpu; +} +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +static void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_need_resched(rq->idle); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} + +static bool wake_up_full_nohz_cpu(int cpu) +{ + if (tick_nohz_full_cpu(cpu)) { + if (cpu != smp_processor_id() || + tick_nohz_tick_stopped()) + smp_send_reschedule(cpu); + return true; + } + + return false; +} + +void wake_up_nohz_cpu(int cpu) +{ + if (!wake_up_full_nohz_cpu(cpu)) + wake_up_idle_cpu(cpu); +} + +static inline bool got_nohz_idle_kick(void) +{ + int cpu = smp_processor_id(); + + if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + return false; + + if (idle_cpu(cpu) && !need_resched()) + return true; + + /* + * We can't run Idle Load Balance on this CPU for this time so we + * cancel it and clear NOHZ_BALANCE_KICK + */ + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + return false; +} + +#else /* CONFIG_NO_HZ_COMMON */ + +static inline bool got_nohz_idle_kick(void) +{ + return false; +} + +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_NO_HZ_FULL +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* Make sure rq->nr_running update is visible after the IPI */ + smp_rmb(); + + /* More than one running task need preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif /* CONFIG_NO_HZ_FULL */ + +void sched_avg_update(struct rq *rq) +{ + s64 period = sched_avg_period(); + + while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); + rq->age_stamp += period; + rq->rt_avg /= 2; + } +} + +#else /* !CONFIG_SMP */ +void resched_task(struct task_struct *p) +{ + assert_raw_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} +#endif /* CONFIG_SMP */ + +#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ + (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) +/* + * Iterate task_group tree rooted at *from, calling @down when first entering a + * node and @up when leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data) +{ + struct task_group *parent, *child; + int ret; + + parent = from; + +down: + ret = (*down)(parent, data); + if (ret) + goto out; + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + ret = (*up)(parent, data); + if (ret || parent == from) + goto out; + + child = parent; + parent = parent->parent; + if (parent) + goto up; +out: + return ret; +} + +int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +static void set_load_weight(struct task_struct *p) +{ + int prio = p->static_prio - MAX_RT_PRIO; + struct load_weight *load = &p->se.load; + + /* + * SCHED_IDLE tasks get minimal weight: + */ + if (p->policy == SCHED_IDLE) { + load->weight = scale_load(WEIGHT_IDLEPRIO); + load->inv_weight = WMULT_IDLEPRIO; + return; + } + + load->weight = scale_load(prio_to_weight[prio]); + load->inv_weight = prio_to_wmult[prio]; +} + +#ifdef CONFIG_MTK_SCHED_CMP_TGS +static void sched_tg_enqueue(struct rq *rq, struct task_struct *p) +{ + int id; + unsigned long flags; + struct task_struct *tg = p->group_leader; + + if(group_leader_is_empty(p)) + return; + id = get_cluster_id(rq->cpu); + if (unlikely(WARN_ON(id < 0))) + return; + + raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags); + tg->thread_group_info[id].nr_running++; + raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags); + +#if 0 + mt_sched_printf("enqueue %d:%s %d:%s %d %lu %lu %lu, %lu %lu %lu", + tg->pid, tg->comm, p->pid, p->comm, id, rq->cpu, + tg->thread_group_info[0].nr_running, + tg->thread_group_info[0].cfs_nr_running, + tg->thread_group_info[0].load_avg_ratio, + tg->thread_group_info[1].nr_running, + tg->thread_group_info[1].cfs_nr_running, + tg->thread_group_info[1].load_avg_ratio); +#endif + //tgs_log(rq, p); +} + +static void sched_tg_dequeue(struct rq *rq, struct task_struct *p) +{ + int id; + unsigned long flags; + struct task_struct *tg = p->group_leader; + + if(group_leader_is_empty(p)) + return; + id = get_cluster_id(rq->cpu); + if (unlikely(WARN_ON(id < 0))) + return; + + raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags); + //WARN_ON(!tg->thread_group_info[id].nr_running); + tg->thread_group_info[id].nr_running--; + raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags); + +#if 0 + mt_sched_printf("dequeue %d:%s %d:%s %d %d %lu %lu %lu, %lu %lu %lu", + tg->pid, tg->comm, p->pid, p->comm, id, rq->cpu, + tg->thread_group_info[0].nr_running, + tg->thread_group_info[0].cfs_nr_running, + tg->thread_group_info[0].load_avg_ratio, + tg->thread_group_info[1].nr_running, + tg->thread_group_info[1].cfs_nr_running, + tg->thread_group_info[1].load_avg_ratio); +#endif + //tgs_log(rq, p); +} + +#endif + +#ifdef CONFIG_MTK_SCHED_CMP_TGS +static void tgs_log(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_MT_SCHED_INFO + struct task_struct *tg = p->group_leader; + + if(group_leader_is_empty(p)) + return; + + // if(!strncmp(tg->comm,"sched_test", 10)){ + mt_sched_printf("%d:%s %d:%s %lu %lu %lu, %lu %lu %lu", tg->pid, tg->comm, p->pid, p->comm, + tg->thread_group_info[0].nr_running, + tg->thread_group_info[0].cfs_nr_running, + tg->thread_group_info[0].load_avg_ratio, + tg->thread_group_info[1].nr_running, + tg->thread_group_info[1].cfs_nr_running, + tg->thread_group_info[1].load_avg_ratio); + // } +#endif +} +#endif + +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +{ + update_rq_clock(rq); + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, flags); +#ifdef CONFIG_MTK_SCHED_CMP_TGS + sched_tg_enqueue(rq, p); + tgs_log(rq, p); +#endif +} + +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +{ + update_rq_clock(rq); + sched_info_dequeued(p); + p->sched_class->dequeue_task(rq, p, flags); +#ifdef CONFIG_MTK_SCHED_CMP_TGS + sched_tg_dequeue(rq, p); + tgs_log(rq, p); +#endif +} + +void activate_task(struct rq *rq, struct task_struct *p, int flags) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, flags); + +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + if( 2 <= rq->nr_running){ + if (1 == cpumask_weight(&p->cpus_allowed)) + mt_lbprof_update_state_has_lock(rq->cpu, MT_LBPROF_AFFINITY_STATE); + else + mt_lbprof_update_state_has_lock(rq->cpu, MT_LBPROF_N_TASK_STATE); + }else if ( (1 == rq->nr_running)){ + mt_lbprof_update_state_has_lock(rq->cpu, MT_LBPROF_ONE_TASK_STATE); + } +#endif +} + +void deactivate_task(struct rq *rq, struct task_struct *p, int flags) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; + + dequeue_task(rq, p, flags); + +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + if ( 1 == rq->nr_running ) + mt_lbprof_update_state_has_lock(rq->cpu, MT_LBPROF_ONE_TASK_STATE); + else if (0 == rq->nr_running) + mt_lbprof_update_state_has_lock(rq->cpu, MT_LBPROF_NO_TASK_STATE); +#endif +} + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + if (static_key_false((¶virt_steal_rq_enabled))) { + u64 st; + + steal = paravirt_steal_clock(cpu_of(rq)); + steal -= rq->prev_steal_time_rq; + + if (unlikely(steal > delta)) + steal = delta; + + st = steal_ticks(steal); + steal = st * TICK_NSEC; + + rq->prev_steal_time_rq += steal; + + delta -= steal; + } +#endif + + rq->clock_task += delta; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) + if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + sched_rt_avg_update(rq, irq_delta + steal); +#endif +} + +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + //struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct sched_param param = { .sched_priority = RTPM_PRIO_CPU_CALLBACK }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + +/* + * __normal_prio - return the priority that is based on the static prio + */ +static inline int __normal_prio(struct task_struct *p) +{ + return p->static_prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ + int prio; + + if (task_has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +inline int task_curr(const struct task_struct *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +static inline void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) +{ + if (prev_class != p->sched_class) { + if (prev_class->switched_from) + prev_class->switched_from(rq, p); + p->sched_class->switched_to(rq, p); +#ifdef CONFIG_MT_SCHED_INTEROP + if(p->on_rq){ + mt_sched_printf(sched_interop, "priority pid=%d comm=%s cpu=%d prev_prio=%d next_prio=%d", + p->pid, p->comm, task_cpu(p), oldprio, p->prio); + } +#endif + } else if (oldprio != p->prio) + p->sched_class->prio_changed(rq, p, oldprio); +} + +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +{ + const struct sched_class *class; + + if (p->sched_class == rq->curr->sched_class) { + rq->curr->sched_class->check_preempt_curr(rq, p, flags); + } else { + for_each_class(class) { + if (class == rq->curr->sched_class) + break; + if (class == p->sched_class) { + resched_task(rq->curr); + break; + } + } + } + + /* + * A queue event has occurred, and we're going to schedule. In + * this case, we can save a useless back to back clock update. + */ + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) + rq->skip_clock_update = 1; +} + +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); +} + +#ifdef CONFIG_SMP +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) +{ +#ifdef CONFIG_SCHED_DEBUG + /* + * We should never call set_task_cpu() on a blocked task, + * ttwu() will sort out the placement. + */ + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + +#ifdef CONFIG_LOCKDEP + /* + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * sched_move_task() holds both and thus holding either pins the cgroup, + * see task_group(). + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). + */ + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); +#endif +#endif + + trace_sched_migrate_task(p, new_cpu); + + if (task_cpu(p) != new_cpu) { + struct task_migration_notifier tmn; + + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); + p->se.nr_migrations++; + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + + tmn.task = p; + tmn.from_cpu = task_cpu(p); + tmn.to_cpu = new_cpu; + + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); + } + + __set_task_cpu(p, new_cpu); +} + +struct migration_arg { + struct task_struct *task; + int dest_cpu; +}; + +static int migration_cpu_stop(void *data); + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, long match_state) +{ + unsigned long flags; + int running, on_rq; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + trace_sched_wait_task(p); + running = task_running(rq, p); + on_rq = p->on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &flags); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(on_rq)) { + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; + } + + return ncsw; +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesn't have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. + */ +void kick_process(struct task_struct *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kick_process); +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SMP +/* + * ->cpus_allowed is protected by both rq->lock and p->pi_lock + */ +static int select_fallback_rq(int cpu, struct task_struct *p) +{ + int nid = cpu_to_node(cpu); + const struct cpumask *nodemask = NULL; + enum { cpuset, possible, fail } state = cpuset; + int dest_cpu; + + /* + * If the node that the cpu is on has been offlined, cpu_to_node() + * will return -1. There is no cpu on the node, and we should + * select the cpu on the other node. + */ + if (nid != -1) { + nodemask = cpumask_of_node(nid); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu(dest_cpu, nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return dest_cpu; + } + } + + for (;;) { + /* Any allowed, online CPU? */ + for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + goto out; + } + + switch (state) { + case cpuset: + /* No more Mr. Nice Guy. */ + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + + case possible: + do_set_cpus_allowed(p, cpu_possible_mask); + state = fail; + break; + + case fail: + BUG(); + break; + } + } + +out: + if (state != cpuset) { + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk_deferred("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } + } + + return dest_cpu; +} + +/* + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. + */ +static inline +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +{ + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + + /* + * In order not to call set_task_cpu() on a blocking task we need + * to rely on ttwu() to place the task on a valid ->cpus_allowed + * cpu. + * + * Since this is common to all placement strategies, this lives here. + * + * [ this allows ->select_task() to simply return task_cpu(p) and + * not worry about this generic constraint ] + */ + if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || + !cpu_online(cpu))) + cpu = select_fallback_rq(task_cpu(p), p); + + return cpu; +} + +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} +#endif + +static void +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) +{ +#ifdef CONFIG_SCHEDSTATS + struct rq *rq = this_rq(); + +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(); + + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); + } else { + struct sched_domain *sd; + + schedstat_inc(p, se.statistics.nr_wakeups_remote); + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + rcu_read_unlock(); + } + + if (wake_flags & WF_MIGRATED) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + +#endif /* CONFIG_SMP */ + + schedstat_inc(rq, ttwu_count); + schedstat_inc(p, se.statistics.nr_wakeups); + + if (wake_flags & WF_SYNC) + schedstat_inc(p, se.statistics.nr_wakeups_sync); + +#endif /* CONFIG_SCHEDSTATS */ +} + +static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +{ + activate_task(rq, p, en_flags); + p->on_rq = 1; + + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/* + * Mark the task runnable and perform wakeup-preemption. + */ +static void +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ + check_preempt_curr(rq, p, wake_flags); + trace_sched_wakeup(p, true); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); + + if (rq->idle_stamp) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } +#endif +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; +#endif + + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (p->on_rq) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; +} + +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct llist_node *llist = llist_del_all(&rq->wake_list); + struct task_struct *p; + + raw_spin_lock(&rq->lock); + + while (llist) { + p = llist_entry(llist, struct task_struct, wake_entry); + llist = llist_next(llist); + ttwu_do_activate(rq, p, 0); + } + + raw_spin_unlock(&rq->lock); +} +enum ipi_msg_type { + IPI_RESCHEDULE, + IPI_CALL_FUNC, + IPI_CALL_FUNC_SINGLE, + IPI_CPU_STOP, +}; +void scheduler_ipi(void) +{ + if (llist_empty(&this_rq()->wake_list) + && !tick_nohz_full_cpu(smp_processor_id()) + && !got_nohz_idle_kick()){ + mt_trace_ISR_start(IPI_RESCHEDULE); + mt_trace_ISR_end(IPI_RESCHEDULE); + return; + } + + /* + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. + * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. + */ + irq_enter(); + mt_trace_ISR_start(IPI_RESCHEDULE); + tick_nohz_full_check(); + sched_ttwu_pending(); + + /* + * Check if someone kicked us for doing the nohz idle load balance. + */ + if (unlikely(got_nohz_idle_kick())) { + this_rq()->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } + mt_trace_ISR_end(IPI_RESCHEDULE); + irq_exit(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) + smp_send_reschedule(cpu); +} + +bool cpus_share_cache(int this_cpu, int that_cpu) +{ + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); +} +#endif /* CONFIG_SMP */ + +static void ttwu_queue(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) + if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ + ttwu_queue_remote(p, cpu); + return; + } +#endif + + raw_spin_lock(&rq->lock); + ttwu_do_activate(rq, p, 0); + raw_spin_unlock(&rq->lock); +} + +/** + * try_to_wake_up - wake up a thread + * @p: the thread to be awakened + * @state: the mask of task states that can be woken + * @wake_flags: wake modifier flags (WF_*) + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. + */ +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +{ + unsigned long flags; + int cpu, success = 0; + + /* + * If we are going to wake up a thread waiting for CONDITION we + * need to ensure that CONDITION=1 done by the caller can not be + * reordered with p->state check below. This pairs with mb() in + * set_current_state() the waiting thread does. + */ + smp_mb__before_spinlock(); + raw_spin_lock_irqsave(&p->pi_lock, flags); + if (!(p->state & state)) + goto out; + + success = 1; /* we're going to change ->state */ + cpu = task_cpu(p); + + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; + +#ifdef CONFIG_SMP + /* + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. + */ + while (p->on_cpu) + cpu_relax(); + /* + * Pairs with the smp_wmb() in finish_lock_switch(). + */ + smp_rmb(); + + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(p); + + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) { +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + char strings[128]=""; +#endif + wake_flags |= WF_MIGRATED; +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + snprintf(strings, 128, "%d:%d:%s:wakeup:%d:%d:%s", task_cpu(current), current->pid, current->comm, cpu, p->pid, p->comm); + trace_sched_lbprof_log(strings); +#endif + set_task_cpu(p, cpu); + } +#endif /* CONFIG_SMP */ + + ttwu_queue(p, cpu); +stat: + ttwu_stat(p, cpu, wake_flags); +out: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return success; +} + +/** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not already there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + + lockdep_assert_held(&rq->lock); + + if (!raw_spin_trylock(&p->pi_lock)) { + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } + + if (!(p->state & TASK_NORMAL)) + goto out; + + if (!p->on_rq) + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + + ttwu_do_wakeup(rq, p, 0); + ttwu_stat(p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); +} + +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +int wake_up_process(struct task_struct *p) +{ + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, 0); +} +EXPORT_SYMBOL(wake_up_process); + +int wake_up_state(struct task_struct *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + * + * __sched_fork() is basic setup used by init_idle() too: + */ +static void __sched_fork(struct task_struct *p) +{ + p->on_rq = 0; + + p->se.on_rq = 0; + p->se.exec_start = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); + +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + p->se.avg.runnable_avg_period = 0; + p->se.avg.runnable_avg_sum = 0; +#ifdef CONFIG_SCHED_HMP + /* keep LOAD_AVG_MAX in sync with fair.c if load avg series is changed */ +#define LOAD_AVG_MAX 47742 + if (p->mm) { + p->se.avg.hmp_last_up_migration = 0; + p->se.avg.hmp_last_down_migration = 0; + p->se.avg.load_avg_ratio = 1023; + p->se.avg.load_avg_contrib = + (1023 * scale_load_down(p->se.load.weight)); + p->se.avg.runnable_avg_period = LOAD_AVG_MAX; + p->se.avg.runnable_avg_sum = LOAD_AVG_MAX; + p->se.avg.usage_avg_sum = LOAD_AVG_MAX; + } +#endif +#endif +#ifdef CONFIG_SCHEDSTATS + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); +#endif + + INIT_LIST_HEAD(&p->rt.run_list); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif + +#ifdef CONFIG_NUMA_BALANCING + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_next_scan = jiffies; + p->mm->numa_next_reset = jiffies; + p->mm->numa_scan_seq = 0; + } + + p->node_stamp = 0ULL; + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; +#endif /* CONFIG_NUMA_BALANCING */ +} + +#ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_SCHED_DEBUG +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sched_feat_set("NUMA"); + else + sched_feat_set("NO_NUMA"); +} +#else +__read_mostly bool numabalancing_enabled; + +void set_numabalancing_state(bool enabled) +{ + numabalancing_enabled = enabled; +} +#endif /* CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_NUMA_BALANCING */ + +/* + * fork()/clone()-time setup: + */ +void sched_fork(struct task_struct *p) +{ + unsigned long flags; + int cpu = get_cpu(); + + __sched_fork(p); + /* + * We mark the process as running here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; + + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + + /* + * Revert to default priority/policy on fork if requested. + */ + if (unlikely(p->sched_reset_on_fork)) { + if (task_has_rt_policy(p)) { + p->policy = SCHED_NORMAL; + p->static_prio = NICE_TO_PRIO(0); + p->rt_priority = 0; + } else if (PRIO_TO_NICE(p->static_prio) < 0) + p->static_prio = NICE_TO_PRIO(0); + + p->prio = p->normal_prio = __normal_prio(p); + set_load_weight(p); + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } + + if (!rt_prio(p->prio)) + p->sched_class = &fair_sched_class; + + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. + */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (likely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#if defined(CONFIG_SMP) + p->on_cpu = 0; +#endif +#ifdef CONFIG_PREEMPT_COUNT + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; +#endif +#ifdef CONFIG_SMP + plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif + + put_cpu(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void wake_up_new_task(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + + raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_SMP + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + */ + set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); +#endif + + /* Initialize new task's runnable average */ + init_task_runnable_average(p); + rq = __task_rq_lock(p); + activate_task(rq, p, 0); + p->on_rq = 1; + trace_sched_wakeup_new(p, true); + check_preempt_curr(rq, p, WF_FORK); +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); +#endif + task_rq_unlock(rq, p, &flags); +} + +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is safe to call from within a preemption notifier. + */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +#else /* !CONFIG_PREEMPT_NOTIFIERS */ + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif /* CONFIG_PREEMPT_NOTIFIERS */ + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + trace_sched_switch(prev, next); + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); + fire_sched_out_preempt_notifiers(prev, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. + * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static void finish_task_switch(struct rq *rq, struct task_struct *prev) + __releases(rq->lock) +{ + struct mm_struct *mm = rq->prev_mm; + long prev_state; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. + * The test for TASK_DEAD must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul <manfred@colorfullife.com> + */ + prev_state = prev->state; + vtime_task_switch(prev); + finish_arch_switch(prev); + perf_event_task_sched_in(prev, current); + finish_lock_switch(rq, prev); + finish_arch_post_lock_switch(); + + fire_sched_in_preempt_notifiers(current); + if (mm) + mmdrop(mm); + if (unlikely(prev_state == TASK_DEAD)) { + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); + put_task_struct(prev); + } + + tick_nohz_task_switch(current); +} + +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ + if (rq->post_schedule) { + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->curr->sched_class->post_schedule) + rq->curr->sched_class->post_schedule(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + rq->post_schedule = 0; + } +} + +#else + +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ +} + +#endif + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(struct task_struct *prev) + __releases(rq->lock) +{ + struct rq *rq = this_rq(); + + finish_task_switch(rq, prev); + + /* + * FIXME: do we need to worry about rq being invalidated by the + * task_switch? + */ + post_schedule(rq); + +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif + if (current->set_child_tid) + put_user(task_pid_vnr(current), current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline void +context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + struct mm_struct *mm, *oldmm; + + prepare_task_switch(rq, prev, next); + +#ifdef CONFIG_MT65XX_TRACER + if(get_mt65xx_mon_mode() == MODE_SCHED_SWITCH) + trace_mt65xx_mon_sched_switch(prev, next); +#endif + mm = next->mm; + oldmm = prev->active_mm; + /* + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. + */ + arch_start_context_switch(prev); + + if (!mm) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (!prev->mm) { + prev->active_mm = NULL; + rq->prev_mm = oldmm; + } + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif + + context_tracking_task_switch(prev, next); + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); +} + +/* + * nr_running and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, total number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +unsigned long long nr_context_switches(void) +{ + int i; + unsigned long long sum = 0; + + for_each_possible_cpu(i) + sum += cpu_rq(i)->nr_switches; + + return sum; +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_possible_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +unsigned long nr_iowait_cpu(int cpu) +{ + struct rq *this = cpu_rq(cpu); + return atomic_read(&this->nr_iowait); +} + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + +unsigned long get_cpu_load(int cpu) +{ + struct rq *this = cpu_rq(cpu); + return this->cpu_load[0]; +} +EXPORT_SYMBOL(get_cpu_load); + +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +static long calc_load_fold_active(struct rq *this_rq) +{ + long nr_active, delta = 0; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + } + + return delta; +} + +/* + * a1 = a0 * e + a * (1 - e) + */ +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; + +static inline int calc_load_write_idx(void) +{ + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); + long delta; + + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ + delta = calc_load_fold_active(this_rq); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } +} + +void calc_load_exit_idle(void) +{ + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + /* + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. + */ + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); + + return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(void) +{ + long delta, active, n; + + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; +} +#else /* !CONFIG_NO_HZ_COMMON */ + +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(unsigned long ticks) +{ + long active, delta; + + if (time_before(jiffies, calc_load_update + 10)) + return; + + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; + + /* + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + */ + calc_global_nohz(); +} + +/* + * Called from update_cpu_load() to periodically update this CPU's + * active count. + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long delta; + + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + delta = calc_load_fold_active(this_rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * End of global load-average stuff + */ + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. + */ +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) +{ + int i, scale; + + this_rq->nr_load_updates++; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +# ifdef CONFIG_SMP +/* moved to kernel/sched/proc.c at Linux 3.11-rc4 */ +static inline unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->cfs.runnable_load_avg; +} +# else +static inline unsigned long get_rq_runnable_load(struct rq *rq) +{ + return rq->load.weight; +} +# endif + +#ifdef CONFIG_NO_HZ_COMMON +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +/* moved to kernel/sched/proc.c at Linux 3.11-rc4 */ +void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long load = this_rq->load.weight; + unsigned long pending_updates; + + /* + * bail if there's load or we're actually up-to-date. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +/* moved to kernel/sched/proc.c at Linux 3.11-rc4 */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * Called from scheduler_tick() + */ +/* moved to kernel/sched/proc.c at Linux 3.11-rc4 */ +static void update_cpu_load_active(struct rq *this_rq) +{ + unsigned long load = get_rq_runnable_load(this_rq); + /* + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, load, 1); + + calc_load_account_active(this_rq); +} + +#ifdef CONFIG_SMP + +/* + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. + */ +void sched_exec(void) +{ + struct task_struct *p = current; + unsigned long flags; + int dest_cpu; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); + if (dest_cpu == smp_processor_id()) + goto unlock; + + if (likely(cpu_active(dest_cpu))) { + struct migration_arg arg = { p, dest_cpu }; + + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); + return; + } +unlock: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +#endif + +DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); + +EXPORT_PER_CPU_SYMBOL(kstat); +EXPORT_PER_CPU_SYMBOL(kernel_cpustat); + +/* + * Return any ns on the sched_clock that have not yet been accounted in + * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. + */ +static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ + u64 ns = 0; + + if (task_current(rq, p)) { + update_rq_clock(rq); + ns = rq->clock_task - p->se.exec_start; + if ((s64)ns < 0) + ns = 0; + } + + return ns; +} + +unsigned long long task_delta_exec(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); + ns = do_task_delta_exec(p, rq); + task_rq_unlock(rq, p, &flags); + + return ns; +} + +/* + * Return accounted runtime for the task. + * In case the task is currently running, return the runtime plus current's + * pending runtime that have not been accounted yet. + */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + task_rq_unlock(rq, p, &flags); + + return ns; +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +void scheduler_tick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = rq->curr; + + sched_clock_tick(); + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + curr->sched_class->task_tick(rq, curr, 0); + update_cpu_load_active(rq); +#ifdef CONFIG_MT_RT_SCHED + mt_check_rt_policy(rq); +#endif + raw_spin_unlock(&rq->lock); + + perf_event_task_tick(); +#ifdef CONFIG_MT_SCHED_MONITOR + if(smp_processor_id() == 0) //only record by CPU#0 + mt_save_irq_counts(); +#endif +#ifdef CONFIG_SMP + rq->idle_balance = idle_cpu(cpu); + trigger_load_balance(rq, cpu); +#endif + rq_last_tick_reset(rq); +} + +#ifdef CONFIG_NO_HZ_FULL +/** + * scheduler_tick_max_deferment + * + * Keep at least one tick per second when a single + * active task is running because the scheduler doesn't + * yet completely support full dynticks environment. + * + * This makes sure that uptime, CFS vruntime, load + * balancing, etc... continue to move forward, even + * with a very low granularity. + */ +u64 scheduler_tick_max_deferment(void) +{ + struct rq *rq = this_rq(); + unsigned long next, now = ACCESS_ONCE(jiffies); + + next = rq->last_sched_tick + HZ; + + if (time_before_eq(next, now)) + return 0; + + return jiffies_to_usecs(next - now) * NSEC_PER_USEC; +} +#endif + +notrace unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} + +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +#ifdef CONFIG_MT_DEBUG_PREEMPT +#define ADD_PREEMPT 0 +#define SUB_PREEMPT 1 +void preempt_dump_backtrace(int type) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = rq->curr; + if( curr!=NULL ){ + if((preempt_count()== 2) && (0==strncmp(curr->comm, "swapper", 7))){ + unsigned long entries[12]={0}; + struct stack_trace trace; + + trace.nr_entries = 0; + trace.max_entries = ARRAY_SIZE(entries); + trace.entries = entries; + trace.skip = 2; + + save_stack_trace(&trace); + + if(type==ADD_PREEMPT){ + mt_sched_printf(sched_preempt, "addpreempt0 %lx %lx %lx %lx %lx %lx", + entries[0], entries[1], entries[2], entries[3], entries[4], entries[5]); + mt_sched_printf(sched_preempt, "addpreempt1 %lx %lx %lx %lx %lx %lx", + entries[6], entries[7], entries[8], entries[9], entries[10], entries[11]); + }else if(type==SUB_PREEMPT){ + mt_sched_printf(sched_preempt, "subpreempt0 %lx %lx %lx %lx %lx %lx", + entries[0], entries[1], entries[2], entries[3], entries[4], entries[5]); + mt_sched_printf(sched_preempt, "subpreempt1 %lx %lx %lx %lx %lx %lx", + entries[6], entries[7], entries[8], entries[9], entries[10], entries[11]); + } + + } + } +} + +#endif +void __kprobes add_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) + return; +#endif + preempt_count() += val; + +#ifdef CONFIG_MT_DEBUG_PREEMPT + preempt_dump_backtrace(ADD_PREEMPT); + +#endif + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Spinlock count overflowing soon? + */ + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); +#endif + //if (preempt_count() == val) + // trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == (val & ~PREEMPT_ACTIVE)){ +#ifdef CONFIG_PREEMPT_TRACER + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +#endif +#ifdef CONFIG_PREEMPT_MONITOR + if(unlikely(__raw_get_cpu_var(mtsched_mon_enabled) & 0x1)){ + //current->t_add_prmpt = sched_clock(); + MT_trace_preempt_off(); + } +#endif + } +} +EXPORT_SYMBOL(add_preempt_count); + +void __kprobes sub_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + return; + /* + * Is the spinlock portion underflowing? + */ + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + +#ifdef CONFIG_MT_DEBUG_PREEMPT + preempt_dump_backtrace(SUB_PREEMPT); +#endif + + //if (preempt_count() == val) + // trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == (val & ~PREEMPT_ACTIVE)){ +#ifdef CONFIG_PREEMPT_TRACER + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +#endif +#ifdef CONFIG_PREEMPT_MONITOR + if(unlikely(__raw_get_cpu_var(mtsched_mon_enabled) & 0x1)){ + MT_trace_preempt_on(); + } +#endif + } + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) +{ + if (oops_in_progress) + return; + + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); + + debug_show_held_locks(prev); + print_modules(); + if (irqs_disabled()) + print_irqtrace_events(prev); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); + BUG_ON(1); +} + +/* + * Various schedule()-time debugging checks and statistics: + */ +static inline void schedule_debug(struct task_struct *prev) +{ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + __schedule_bug(prev); + rcu_sleep_check(); + + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + + schedstat_inc(this_rq(), sched_count); +} + +static void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + if (prev->on_rq || rq->skip_clock_update < 0) + update_rq_clock(rq); + prev->sched_class->put_prev_task(rq, prev); +} + +/* + * Pick up the highest-prio task: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq) +{ + const struct sched_class *class; + struct task_struct *p; + + /* + * Optimization: we know that if all tasks are in + * the fair class we can call that function directly: + */ + if (likely(rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq); + if (likely(p)) + return p; + } + + for_each_class(class) { + p = class->pick_next_task(rq); + if (p) + return p; + } + + BUG(); /* the idle class will always have a runnable task */ + + return 0; +} + +/* + * __schedule() is the main scheduler function. + * + * The main means of driving the scheduler and thus entering this function are: + * + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. + * + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) + * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space + */ +static void __sched __schedule(void) +{ + struct task_struct *prev, *next; + unsigned long *switch_count; + struct rq *rq; + int cpu; + +need_resched: + preempt_disable(); + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + rcu_note_context_switch(cpu); + prev = rq->curr; + + schedule_debug(prev); + + if (sched_feat(HRTICK)) + hrtick_clear(rq); +#ifdef CONFIG_MT_SCHED_MONITOR + __raw_get_cpu_var(MT_trace_in_sched) = 1; +#endif + + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller to avoid the race with signal_wake_up(). + */ + smp_mb__before_spinlock(); + raw_spin_lock_irq(&rq->lock); + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (unlikely(signal_pending_state(prev->state, prev))) { + prev->state = TASK_RUNNING; + } else { + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; + + /* + * If a worker went to sleep, notify and ask workqueue + * whether it wants to wake up a task to maintain + * concurrency. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } + } + switch_count = &prev->nvcsw; + } + + pre_schedule(rq, prev); + + if (unlikely(!rq->nr_running)) + idle_balance(cpu, rq); + + put_prev_task(rq, prev); + next = pick_next_task(rq); + clear_tsk_need_resched(prev); + rq->skip_clock_update = 0; + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + context_switch(rq, prev, next); /* unlocks the rq */ + /* + * The context switch have flipped the stack from under us + * and restored the local variables which were saved when + * this task called schedule() in the past. prev == current + * is still correct, but it can be moved to another cpu/rq. + */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + } else + raw_spin_unlock_irq(&rq->lock); + +#ifdef CONFIG_MT_RT_SCHED + mt_post_schedule(rq); +#endif +#ifdef CONFIG_MT_SCHED_MONITOR + __raw_get_cpu_var(MT_trace_in_sched) = 0; +#endif + post_schedule(rq); + + sched_preempt_enable_no_resched(); + if (need_resched()) + goto need_resched; +} + +static inline void sched_submit_work(struct task_struct *tsk) +{ + if (!tsk->state || tsk_is_pi_blocked(tsk)) + return; + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(tsk)) + blk_schedule_flush_plug(tsk); +} + +asmlinkage void __sched schedule(void) +{ + struct task_struct *tsk = current; + + sched_submit_work(tsk); + __schedule(); +} +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +asmlinkage void __sched schedule_user(void) +{ + /* + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. + */ + user_exit(); + schedule(); + user_enter(); +} +#endif + +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + sched_preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} + +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched notrace preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); + + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (likely(ti->preempt_count || irqs_disabled())) + return; + + do { + add_preempt_count_notrace(PREEMPT_ACTIVE); + __schedule(); + sub_preempt_count_notrace(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (need_resched()); +} +EXPORT_SYMBOL(preempt_schedule); + +/* + * this is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. + */ +asmlinkage void __sched preempt_schedule_irq(void) +{ + struct thread_info *ti = current_thread_info(); + enum ctx_state prev_state; + + /* Catch callers which need to be fixed */ + BUG_ON(ti->preempt_count || !irqs_disabled()); + + prev_state = exception_enter(); + + do { + add_preempt_count(PREEMPT_ACTIVE); + local_irq_enable(); + __schedule(); + local_irq_disable(); + sub_preempt_count(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (need_resched()); + + exception_exit(prev_state); +} + +#endif /* CONFIG_PREEMPT */ + +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, + void *key) +{ + return try_to_wake_up(curr->private, mode, wake_flags); +} +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + wait_queue_t *curr, *next; + + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + unsigned flags = curr->flags; + + if (curr->func(curr, mode, wake_flags, key) && + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +{ + __wake_up_common(q, mode, nr, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + int wake_flags = WF_SYNC; + + if (unlikely(!q)) + return; + + if (unlikely(!nr_exclusive)) + wake_flags = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + __add_wait_queue_tail_exclusive(&x->wait, &wait); + do { + if (signal_pending_state(state, current)) { + timeout = -ERESTARTSYS; + break; + } + __set_current_state(state); + spin_unlock_irq(&x->wait.lock); + timeout = action(timeout); + spin_lock_irq(&x->wait.lock); + } while (!x->done && timeout); + __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; + } + x->done--; + return timeout ?: 1; +} + +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + timeout = do_wait_for_common(x, action, timeout, state); + spin_unlock_irq(&x->wait.lock); + return timeout; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ + wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. The caller is accounted as waiting for IO. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(completion_done); + +static long __sched +sleep_on_common(wait_queue_head_t *q, int state, long timeout) +{ + unsigned long flags; + wait_queue_t wait; + + init_waitqueue_entry(&wait, current); + + __set_current_state(state); + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, &wait); + spin_unlock(&q->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&q->lock); + __remove_wait_queue(q, &wait); + spin_unlock_irqrestore(&q->lock, flags); + + return timeout; +} + +void __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(interruptible_sleep_on); + +long __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void __sched sleep_on(wait_queue_head_t *q) +{ + sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(sleep_on); + +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(sleep_on_timeout); + +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +void rt_mutex_setprio(struct task_struct *p, int prio) +{ + int oldprio, on_rq, running; + struct rq *rq; + const struct sched_class *prev_class; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = __task_rq_lock(p); + + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + + trace_sched_pi_setprio(p, prio); + oldprio = p->prio; + prev_class = p->sched_class; + on_rq = p->on_rq; + running = task_current(rq, p); + if (on_rq) + dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + + p->prio = prio; + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); + + check_class_changed(rq, p, prev_class, oldprio); +out_unlock: + __task_rq_unlock(rq); +} +#endif + +#ifdef CONFIG_MT_PRIO_TRACER +void set_user_nice_core(struct task_struct *p, long nice) +{ + int old_prio, delta, on_rq; + unsigned long flags; + struct rq *rq; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * SCHED_FIFO/SCHED_RR: + */ + if (task_has_rt_policy(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + on_rq = p->on_rq; + if (on_rq) + dequeue_task(rq, p, 0); + + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; + + if (on_rq) { + enqueue_task(rq, p, 0); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, p, &flags); +} + +void set_user_nice(struct task_struct *p, long nice) +{ + set_user_nice_core(p, nice); + /* setting nice implies to set a normal sched policy */ + update_prio_tracer(task_pid_nr(p), NICE_TO_PRIO(nice), 0, PTS_KRNL); +} +#else /* !CONFIG_MT_PRIO_TRACER */ +void set_user_nice(struct task_struct *p, long nice) +{ + int old_prio, delta, on_rq; + unsigned long flags; + struct rq *rq; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * SCHED_FIFO/SCHED_RR: + */ + if (task_has_rt_policy(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + on_rq = p->on_rq; + if (on_rq) + dequeue_task(rq, p, 0); + + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; + + if (on_rq) { + enqueue_task(rq, p, 0); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, p, &flags); +} +#endif +EXPORT_SYMBOL(set_user_nice); + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = 20 - nice; + + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || + capable(CAP_SYS_NICE)); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ + long nice, retval; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + if (increment < -40) + increment = -40; + if (increment > 40) + increment = 40; + + nice = TASK_NICE(current) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + + if (increment < 0 && !can_nice(current, nice)) + return -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; +#ifdef CONFIG_MT_PRIO_TRACER + set_user_nice_syscall(current, nice); +#else + set_user_nice(current, nice); +#endif + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +int task_prio(const struct task_struct *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +int task_nice(const struct task_struct *p) +{ + return TASK_NICE(p); +} +EXPORT_SYMBOL(task_nice); + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + */ +int idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle) + return 0; + + if (rq->nr_running) + return 0; + +#ifdef CONFIG_SMP + if (!llist_empty(&rq->wake_list)) + return 0; +#endif + + return 1; +} + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. + */ +struct task_struct *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +static struct task_struct *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_vpid(pid) : current; +} + +extern struct cpumask hmp_slow_cpu_mask; + +/* Actually do priority change: must hold rq lock. */ +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +{ + p->policy = policy; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) { + p->sched_class = &rt_sched_class; + } + else + p->sched_class = &fair_sched_class; + set_load_weight(p); +} + +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); + rcu_read_unlock(); + return match; +} + +static int check_mt_allow_rt(struct sched_param *param) +{ + int allow = 0; + if(0 == MT_ALLOW_RT_PRIO_BIT){ + //this condition check will be removed + return 1; + } + + if(param->sched_priority & MT_ALLOW_RT_PRIO_BIT){ + param->sched_priority &= ~MT_ALLOW_RT_PRIO_BIT; + allow = 1; + } + return allow; +} + +static int __sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool user) +{ + int retval, oldprio, oldpolicy = -1, on_rq, running; + unsigned long flags; + const struct sched_class *prev_class; + struct rq *rq; + int reset_on_fork; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); +recheck: + /* double check policy once rq lock held */ + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; + policy = oldpolicy = p->policy; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + + if(rt_policy(policy)){ + if (!check_mt_allow_rt((struct sched_param *)param)){ + printk("[RT_MONITOR]WARNNING [%d:%s] SET NOT ALLOW RT Prio [%d] for proc [%d:%s]\n", current->pid, current->comm, param->sched_priority, p->pid, p->comm); + //dump_stack(); + } + } + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. + */ + if (param->sched_priority < 0 || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + return -EINVAL; + if (rt_policy(policy) != (param->sched_priority != 0)) + return -EINVAL; + + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (user && !capable(CAP_SYS_NICE)) { + if (rt_policy(policy)) { + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } + + /* + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. + */ + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (!can_nice(p, TASK_NICE(p))) + return -EPERM; + } + + /* can't change other user's priorities */ + if (!check_same_owner(p)) + return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; + } + + if (user) { + retval = security_task_setscheduler(p); + if (retval) + return retval; + } + + /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + * + * To be able to change p->policy safely, the appropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &flags); + + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + task_rq_unlock(rq, p, &flags); + return -EINVAL; + } + + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + task_rq_unlock(rq, p, &flags); + return 0; + } + +#ifdef CONFIG_RT_GROUP_SCHED + if (user) { + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } + } +#endif + + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &flags); + goto recheck; + } + on_rq = p->on_rq; + running = task_current(rq, p); + if (on_rq) + dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + p->sched_reset_on_fork = reset_on_fork; + + oldprio = p->prio; + prev_class = p->sched_class; + __setscheduler(rq, p, policy, param->sched_priority); + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, p, 0); + + check_class_changed(rq, p, prev_class, oldprio); + task_rq_unlock(rq, p, &flags); + + rt_mutex_adjust_pi(p); + + return 0; +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. + */ +#ifdef CONFIG_MT_PRIO_TRACER +int sched_setscheduler_core(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, true); +} + +int sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param) +{ + int retval; + + retval = sched_setscheduler_core(p, policy, param); + if (!retval) { + int prio = param->sched_priority & ~MT_ALLOW_RT_PRIO_BIT; + if (!rt_policy(policy)) + prio = __normal_prio(p); + else + prio = MAX_RT_PRIO-1 - prio; + update_prio_tracer(task_pid_nr(p), prio, policy, PTS_KRNL); + } + return retval; +} +#else /* !CONFIG_MT_PRIO_TRACER */ +int sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, true); +} +#endif +EXPORT_SYMBOL_GPL(sched_setscheduler); + +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. + */ +#ifdef CONFIG_MT_PRIO_TRACER +int sched_setscheduler_nocheck_core(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, false); +} + + +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + const struct sched_param *param) +{ + int retval; + + retval = sched_setscheduler_nocheck_core(p, policy, param); + if (!retval) { + int prio = param->sched_priority & ~MT_ALLOW_RT_PRIO_BIT; + if (!rt_policy(policy)) + prio = __normal_prio(p); + else + prio = MAX_RT_PRIO-1 - prio; + update_prio_tracer(task_pid_nr(p), prio, policy, PTS_KRNL); + } + return retval; +} +#else /* !CONFIG_MT_PRIO_TRACER */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, false); +} +#endif + +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lparam; + struct task_struct *p; + int retval; + + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); +#ifdef CONFIG_MT_PRIO_TRACER + if (p != NULL) + retval = sched_setscheduler_syscall(p, policy, &lparam); +#else + if (p != NULL) + retval = sched_setscheduler(p, policy, &lparam); +#endif + + rcu_read_unlock(); + + return retval; +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + */ +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, + struct sched_param __user *, param) +{ + /* negative values for policy are not valid */ + if (policy < 0) + return -EINVAL; + + return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) +{ + return do_sched_setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + */ +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) +{ + struct task_struct *p; + int retval; + + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + rcu_read_lock(); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); + } + rcu_read_unlock(); + return retval; +} + +/** + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + */ +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) +{ + struct sched_param lp; + struct task_struct *p; + int retval; + + if (!param || pid < 0) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + rcu_read_unlock(); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + cpumask_var_t cpus_allowed, new_mask; + struct task_struct *p; + int retval; + + get_online_cpus(); + rcu_read_lock(); + + p = find_process_by_pid(pid); + if (!p) { + rcu_read_unlock(); + put_online_cpus(); + printk(KERN_DEBUG "SCHED: setaffinity find process %d fail\n", pid); + return -ESRCH; + } + + /* Prevent p going away */ + get_task_struct(p); + rcu_read_unlock(); + + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + printk(KERN_DEBUG "SCHED: setaffinity flags PF_NO_SETAFFINITY fail\n"); + goto out_put_task; + } + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + printk(KERN_DEBUG "SCHED: setaffinity allo_cpumask_var for cpus_allowed fail\n"); + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + printk(KERN_DEBUG "SCHED: setaffinity allo_cpumask_var for new_mask fail\n"); + goto out_free_cpus_allowed; + } + retval = -EPERM; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + printk(KERN_DEBUG "SCHED: setaffinity check_same_owner and task_ns_capable fail\n"); + goto out_unlock; + } + rcu_read_unlock(); + } + + retval = security_task_setscheduler(p); + if (retval){ + printk(KERN_DEBUG "SCHED: setaffinity security_task_setscheduler fail, status: %d\n", retval); + goto out_unlock; + } + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); +again: + retval = set_cpus_allowed_ptr(p, new_mask); + if (retval) + printk(KERN_DEBUG "SCHED: set_cpus_allowed_ptr status %d\n", retval); + + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + } +out_unlock: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: + put_task_struct(p); + put_online_cpus(); + if (retval) + printk(KERN_DEBUG "SCHED: setaffinity status %d\n", retval); +#ifdef CONFIG_MT_SCHED_INTEROP + else + mt_sched_printf(sched_interop, "set affinity pid=%d comm=%s affinity=%ld", + p->pid, p->comm, p->cpus_allowed.bits[0]); +#endif + + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + struct cpumask *new_mask) +{ + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + cpumask_var_t new_mask; + int retval; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; +} + +long sched_getaffinity(pid_t pid, struct cpumask *mask) +{ + struct task_struct *p; + unsigned long flags; + int retval; + + get_online_cpus(); + rcu_read_lock(); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p){ + printk(KERN_DEBUG "SCHED: getaffinity find process %d fail\n", pid); + goto out_unlock; + } + + retval = security_task_getscheduler(p); + if (retval){ + printk(KERN_DEBUG "SCHED: getaffinity security_task_getscheduler fail, status: %d\n", retval); + goto out_unlock; + } + + raw_spin_lock_irqsave(&p->pi_lock, flags); + cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +out_unlock: + rcu_read_unlock(); + put_online_cpus(); + + if (retval){ + printk(KERN_DEBUG "SCHED: getaffinity status %d\n", retval); + } + return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + int ret; + cpumask_var_t mask; + + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) + return -EINVAL; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) + ret = -EFAULT; + else + ret = retlen; + } + free_cpumask_var(mask); + + return ret; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. + */ +SYSCALL_DEFINE0(sched_yield) +{ + struct rq *rq = this_rq_lock(); + + schedstat_inc(rq, yld_count); + current->sched_class->yield_task(rq); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&rq->lock); + sched_preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +static inline int should_resched(void) +{ + return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} + +static void __cond_resched(void) +{ + add_preempt_count(PREEMPT_ACTIVE); + __schedule(); + sub_preempt_count(PREEMPT_ACTIVE); +} + +int __sched _cond_resched(void) +{ + if (should_resched()) { + __cond_resched(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(_cond_resched); + +/* + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). + */ +int __cond_resched_lock(spinlock_t *lock) +{ + int resched = should_resched(); + int ret = 0; + + lockdep_assert_held(lock); + + if (spin_needbreak(lock) || resched) { + spin_unlock(lock); + if (resched) + __cond_resched(); + else + cpu_relax(); + ret = 1; + spin_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_lock); + +int __sched __cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (should_resched()) { + local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(__cond_resched_softirq); + +/** + * yield - yield the current processor to other threads. + * + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. + */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + unsigned long flags; + int yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) { + yielded = -ESRCH; + goto out_irq; + } + + double_rq_lock(rq, p_rq); + while (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; + } + + if (!curr->sched_class->yield_to_task) + goto out_unlock; + + if (curr->sched_class != p->sched_class) + goto out_unlock; + + if (task_running(p_rq, p) || p->state) + goto out_unlock; + + yielded = curr->sched_class->yield_to_task(rq, p, preempt); + if (yielded) { + schedstat_inc(rq, yld_count); + /* + * Make p's CPU reschedule; pick_next_entity takes care of + * fairness. + */ + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + } + +out_unlock: + double_rq_unlock(rq, p_rq); +out_irq: + local_irq_restore(flags); + + if (yielded > 0) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + */ +void __sched io_schedule(void) +{ + struct rq *rq = raw_rq(); + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); + current->in_iowait = 1; + schedule(); + current->in_iowait = 0; + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); +} +EXPORT_SYMBOL(io_schedule); + +long __sched io_schedule_timeout(long timeout) +{ + struct rq *rq = raw_rq(); + long ret; + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); + current->in_iowait = 1; + ret = schedule_timeout(timeout); + current->in_iowait = 0; + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); + return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. + */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct task_struct *p; + unsigned int time_slice; + unsigned long flags; + struct rq *rq; + int retval; + struct timespec t; + + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + rcu_read_lock(); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + rq = task_rq_lock(p, &flags); + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, p, &flags); + + rcu_read_unlock(); + jiffies_to_timespec(time_slice, &t); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; +#ifdef CONFIG_MT_DEBUG_MUTEXES +void mt_mutex_state(struct task_struct *p) +{ + struct task_struct *locker; + if(p->blocked_on){ + locker = p->blocked_on->task_wait_on; + if(locker == NULL) return; + if(find_task_by_vpid(locker->pid) != NULL){ + printk("Hint: wait on mutex, holder is [%d:%s:%ld]\n", locker->pid, locker->comm, locker->state); + if(locker->state != TASK_RUNNING){ + printk("Mutex holder process[%d:%s] is not running now:\n", locker->pid, locker->comm); + show_stack(locker, NULL); + printk("----\n"); + } + }else{ + printk("Hint: wait on mutex, but holder already released lock\n"); + } + } +} +#endif +void sched_show_task(struct task_struct *p) +{ + unsigned long free = 0; + int ppid; + unsigned state; + + state = p->state ? __ffs(p->state) + 1 : 0; + printk(KERN_INFO "%-15.15s %c", p->comm, + state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); +#if BITS_PER_LONG == 32 + if (state == TASK_RUNNING) + printk(KERN_CONT " running "); + else + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(KERN_CONT " running task "); + else + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + free = stack_not_used(p); +#endif + rcu_read_lock(); + ppid = task_pid_nr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), ppid, + (unsigned long)task_thread_info(p)->flags); + + print_worker_info(KERN_INFO, p); + show_stack(p, NULL); +#ifdef CONFIG_MT_DEBUG_MUTEXES + mt_mutex_state(p); +#endif +} + +void show_state_filter(unsigned long state_filter) +{ + struct task_struct *g, *p; + +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); +#else + printk(KERN_INFO + " task PC stack pid father\n"); +#endif + rcu_read_lock(); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take a lot of time: + */ + touch_nmi_watchdog(); + if (!state_filter || (p->state & state_filter)) + sched_show_task(p); + } while_each_thread(g, p); + + touch_all_softlockup_watchdogs(); + +#ifdef CONFIG_SCHED_DEBUG + sysrq_sched_debug_show(); +#endif + rcu_read_unlock(); + /* + * Only show locks if all tasks are dumped: + */ + if (!state_filter) + debug_show_all_locks(); +} + +void __cpuinit init_idle_bootup_task(struct task_struct *idle) +{ + idle->sched_class = &idle_sched_class; +} + +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. + */ +void __cpuinit init_idle(struct task_struct *idle, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + + __sched_fork(idle); + idle->state = TASK_RUNNING; + idle->se.exec_start = sched_clock(); + + do_set_cpus_allowed(idle, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); + __set_task_cpu(idle, cpu); + rcu_read_unlock(); + + rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) + idle->on_cpu = 1; +#endif + raw_spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! */ + task_thread_info(idle)->preempt_count = 0; + + /* + * The idle tasks have their own, simple scheduling class: + */ + idle->sched_class = &idle_sched_class; + ftrace_graph_init_idle_task(idle, cpu); + vtime_init_idle(idle, cpu); +#if defined(CONFIG_SMP) + sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); +#endif +} + +#ifdef CONFIG_SMP +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + if (p->sched_class && p->sched_class->set_cpus_allowed) + p->sched_class->set_cpus_allowed(p, new_mask); + + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); +} + +/* + * This is how migration works: + * + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + unsigned long flags; + struct rq *rq; + unsigned int dest_cpu; + int ret = 0; + + rq = task_rq_lock(p, &flags); + + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; + + if (!cpumask_intersects(new_mask, cpu_active_mask)) { + ret = -EINVAL; + printk_deferred("SCHED: intersects new_mask: %lu, cpu_active_mask: %lu\n", new_mask->bits[0], cpu_active_mask->bits[0]); + goto out; + } + + do_set_cpus_allowed(p, new_mask); + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask)) + goto out; + + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (p->on_rq) { + struct migration_arg arg = { p, dest_cpu }; + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, p, &flags); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + return 0; + } +out: + task_rq_unlock(rq, p, &flags); + + return ret; +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + * + * Returns non-zero if task was successfully migrated. + */ +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ + struct rq *rq_dest, *rq_src; + int ret = 0; + + if (unlikely(!cpu_active(dest_cpu))) + return ret; + + rq_src = cpu_rq(src_cpu); + rq_dest = cpu_rq(dest_cpu); + + raw_spin_lock(&p->pi_lock); + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto done; + /* Affinity changed (again). */ + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + goto fail; + + /* + * If we're not on a rq, the next wake-up will ensure we're + * placed properly. + */ + if (p->on_rq) { + dequeue_task(rq_src, p, 0); + set_task_cpu(p, dest_cpu); + enqueue_task(rq_dest, p, 0); + check_preempt_curr(rq_dest, p, 0); + } +done: + ret = 1; +fail: + double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&p->pi_lock); + return ret; +} + +/* + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. + */ +static int migration_cpu_stop(void *data) +{ + struct migration_arg *arg = data; + + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ + local_irq_disable(); + __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); + local_irq_enable(); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); +} + +/* + * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations". + */ +static void calc_load_migrate(struct rq *rq) +{ + long delta = calc_load_fold_active(rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); +} + +/* + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts. + */ +static void migrate_tasks(unsigned int dead_cpu) +{ + struct rq *rq = cpu_rq(dead_cpu); + struct task_struct *next, *stop = rq->stop; + int dest_cpu; + + /* + * Fudge the rq selection such that the below task selection loop + * doesn't get stuck on the currently eligible stop task. + * + * We're currently inside stop_machine() and the rq is either stuck + * in the stop_machine_cpu_stop() loop, or we're executing this code, + * either way we should never end up calling schedule() until we're + * done here. + */ + rq->stop = NULL; + /* MTK patch: prevent could not migrate RT task when RT throttle*/ + unthrottle_offline_rt_rqs(rq); + + for ( ; ; ) { + /* + * There's this thread running, bail when that's the only + * remaining thread. + */ + if (rq->nr_running == 1) + break; + + next = pick_next_task(rq); + BUG_ON(!next); + next->sched_class->put_prev_task(rq, next); + + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_cpu, next); + raw_spin_unlock(&rq->lock); + + __migrate_task(next, dead_cpu, dest_cpu); + + raw_spin_lock(&rq->lock); + } + + rq->stop = stop; +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. + */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(13); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[12] is terminator */ + + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +static void unregister_sched_domain_sysctl(void) +{ + if (sd_sysctl_header) + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#else +static void register_sched_domain_sysctl(void) +{ +} +static void unregister_sched_domain_sysctl(void) +{ +} +#endif + +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpumask_set_cpu(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpumask_clear_cpu(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int __cpuinit +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + unsigned long flags; + struct rq *rq = cpu_rq(cpu); + + switch (action & ~CPU_TASKS_FROZEN) { + + case CPU_UP_PREPARE: + rq->calc_load_update = calc_load_update; + break; + + case CPU_ONLINE: + /* Update our root-domain */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + sched_ttwu_pending(); + /* Update our root-domain */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + migrate_tasks(cpu); + BUG_ON(rq->nr_running != 1); /* the migration thread */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + break; + + case CPU_DEAD: + calc_load_migrate(rq); + break; +#endif + } + + update_max_interval(); + + return NOTIFY_OK; +} + +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else. This has to be lower priority than + * the notifier in the perf_event subsystem, though. + */ +static struct notifier_block __cpuinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = CPU_PRI_MIGRATION, +}; + +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __init migration_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + /* Initialize migration for the boot CPU */ + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); + migration_call(&migration_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&migration_notifier); + + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + + return 0; +} +early_initcall(migration_init); +#endif + +#ifdef CONFIG_SMP + +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + +#ifdef CONFIG_SCHED_DEBUG + +static __read_mostly int sched_debug_enabled; + +static int __init sched_debug_setup(char *str) +{ + sched_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + struct cpumask *groupmask) +{ + struct sched_group *group = sd->groups; + char str[256]; + + cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); + cpumask_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; + } + + printk(KERN_CONT "span %s level %s\n", str, sd->name); + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); + } + + printk(KERN_DEBUG "%*s groups:", level + 1, ""); + do { + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + /* + * Even though we initialize ->power to something semi-sane, + * we leave power_orig unset. This allows us to detect if + * domain iteration is still funny without causing /0 traps. + */ + if (!group->sgp->power_orig) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); + break; + } + + if (!cpumask_weight(sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); + break; + } + + if (!(sd->flags & SD_OVERLAP) && + cpumask_intersects(groupmask, sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; + } + + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + + cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); + + printk(KERN_CONT " %s", str); + if (group->sgp->power != SCHED_POWER_SCALE) { + printk(KERN_CONT " (cpu_power = %d)", + group->sgp->power); + } + + group = group->next; + } while (group != sd->groups); + printk(KERN_CONT "\n"); + + if (!cpumask_equal(sched_domain_span(sd), groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + if (!sched_debug_enabled) + return; + + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + for (;;) { + if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) + break; + level++; + sd = sd->parent; + if (!sd) + break; + } +} +#else /* !CONFIG_SCHED_DEBUG */ +# define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpumask_weight(sched_domain_span(sd)) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_AFFINE)) + return 0; + + return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) + return 0; + + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; + } + if (~cflags & pflags) + return 0; + + return 1; +} + +static void free_rootdomain(struct rcu_head *rcu) +{ + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); + + cpupri_cleanup(&rd->cpupri); + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + struct root_domain *old_rd = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + old_rd = rq->rd; + + if (cpumask_test_cpu(rq->cpu, old_rd->online)) + set_rq_offline(rq); + + cpumask_clear_cpu(rq->cpu, old_rd->span); + + /* + * If we dont want to free the old_rt yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + if (old_rd) + call_rcu_sched(&old_rd->rcu, free_rootdomain); +} + +static int init_rootdomain(struct root_domain *rd) +{ + memset(rd, 0, sizeof(*rd)); + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri) != 0) + goto free_rto_mask; + return 0; + +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +out: + return -ENOMEM; +} + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +struct root_domain def_root_domain; + +static void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain); + + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + if (init_rootdomain(rd) != 0) { + kfree(rd); + return NULL; + } + + return rd; +} + +static void free_sched_groups(struct sched_group *sg, int free_sgp) +{ + struct sched_group *tmp, *first; + + if (!sg) + return; + + first = sg; + do { + tmp = sg->next; + + if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) + kfree(sg->sgp); + + kfree(sg); + sg = tmp; + } while (sg != first); +} + +static void free_sched_domain(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + /* + * If its an overlapping domain it has private groups, iterate and + * nuke them all. + */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgp); + kfree(sd->groups); + } + kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ + call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ + for (; sd; sd = sd->parent) + destroy_sched_domain(sd, cpu); +} + +/* + * Keep a special pointer to the highest sched_domain that has + * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this + * allows us to avoid some pointer chasing select_idle_sibling(). + * + * Also keep a unique ID per domain (we use the first cpu number in + * the cpumask of the domain), this allows us to quickly tell if + * two cpus are in the same cache domain, see cpus_share_cache(). + */ +DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_id); + +static void update_top_cache_domain(int cpu) +{ + struct sched_domain *sd; + int id = cpu; + + sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_id, cpu) = id; +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; ) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + + if (sd_parent_degenerate(tmp, parent)) { + tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + destroy_sched_domain(parent, cpu); + } else + tmp = tmp->parent; + } + + if (sd && sd_degenerate(sd)) { + tmp = sd; + sd = sd->parent; + destroy_sched_domain(tmp, cpu); + if (sd) + sd->child = NULL; + } + + sched_domain_debug(sd, cpu); + + rq_attach_root(rq, rd); + tmp = rq->sd; + rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp, cpu); + +#if defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK) || defined (CONFIG_HMP_PACK_SMALL_TASK) + update_packing_domain(cpu); +#endif /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK || CONFIG_HMP_PACK_SMALL_TASK */ + update_top_cache_domain(cpu); +} + +/* cpus with isolated domains */ +static cpumask_var_t cpu_isolated_map; + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + alloc_bootmem_cpumask_var(&cpu_isolated_map); + cpulist_parse(str, cpu_isolated_map); + return 1; +} + +__setup("isolcpus=", isolated_cpu_setup); + +static const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} + +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; +}; + +struct s_data { + struct sched_domain ** __percpu sd; + struct root_domain *rd; +}; + +enum s_alloc { + sa_rootdomain, + sa_sd, + sa_sd_storage, + sa_none, +}; + +struct sched_domain_topology_level; + +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + +#define SDTL_OVERLAP 0x01 + +struct sched_domain_topology_level { + sched_domain_init_f init; + sched_domain_mask_f mask; + int flags; + int numa_level; + struct sd_data data; +}; + +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. + * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered = sched_domains_tmpmask; + struct sd_data *sdd = sd->private; + struct sched_domain *child; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct cpumask *sg_span; + + if (cpumask_test_cpu(i, covered)) + continue; + + child = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(child))) + continue; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(cpu)); + + if (!sg) + goto fail; + + sg_span = sched_group_cpus(sg); + if (child->child) { + child = child->child; + cpumask_copy(sg_span, sched_domain_span(child)); + } else + cpumask_set_cpu(i, sg_span); + + cpumask_or(covered, covered, sg_span); + + sg->sgp = *per_cpu_ptr(sdd->sgp, i); + if (atomic_inc_return(&sg->sgp->ref) == 1) + build_group_mask(sd, sg); + + /* + * Initialize sgp->power such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); + + /* + * Make sure the first group of this domain contains the + * canonical balance cpu. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). + */ + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + group_balance_cpu(sg) == cpu) + groups = sg; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + last->next = first; + } + sd->groups = groups; + + return 0; + +fail: + free_sched_groups(first, 0); + + return -ENOMEM; +} + +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) +{ + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + struct sched_domain *child = sd->child; + + if (child) + cpu = cpumask_first(sched_domain_span(child)); + + if (sg) { + *sg = *per_cpu_ptr(sdd->sg, cpu); + (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); + atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + } + + return cpu; +} + +/* + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. + * + * Assumes the sched_domain tree is fully constructed + */ +static int +build_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL; + struct sd_data *sdd = sd->private; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered; + int i; + + get_group(cpu, sdd, &sd->groups); + atomic_inc(&sd->groups->ref); + + if (cpu != cpumask_first(sched_domain_span(sd))) + return 0; + + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct sched_group *sg; + int group = get_group(i, sdd, &sg); + int j; + + if (cpumask_test_cpu(i, covered)) + continue; + + cpumask_clear(sched_group_cpus(sg)); + sg->sgp->power = 0; + cpumask_setall(sched_group_mask(sg)); + + for_each_cpu(j, span) { + if (get_group(j, sdd, NULL) != group) + continue; + + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); + } + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; + + return 0; +} + +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be same unless + * there are asymmetries in the topology. If there are asymmetries, group + * having more cpu_power will pickup more load compared to the group having + * less cpu_power. + */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ + struct sched_group *sg = sd->groups; + + WARN_ON(!sd || !sg); + + do { + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg = sg->next; + } while (sg != sd->groups); + + if (cpu != group_balance_cpu(sg)) + return; + + update_group_power(sd, cpu); + atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + +#if defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK) || defined (CONFIG_HMP_PACK_SMALL_TASK) +int __weak arch_sd_share_power_line(void) +{ + return 0*SD_SHARE_POWERLINE; +} +#endif /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK || CONFIG_HMP_PACK_SMALL_TASK */ +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + +#define SD_INIT_FUNC(type) \ +static noinline struct sched_domain * \ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ +{ \ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ + *sd = SD_##type##_INIT; \ + SD_INIT_NAME(sd, type); \ + sd->private = &tl->data; \ + return sd; \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif + +static int default_relax_domain_level = -1; +int sched_domain_level_max; + +static int __init setup_relax_domain_level(char *str) +{ + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } +} + +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_rootdomain: + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); /* fall through */ + case sa_sd: + free_percpu(d->sd); /* fall through */ + case sa_sd_storage: + __sdt_free(cpu_map); /* fall through */ + case sa_none: + break; + } +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, + const struct cpumask *cpu_map) +{ + memset(d, 0, sizeof(*d)); + + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; + d->rd = alloc_rootdomain(); + if (!d->rd) + return sa_sd; + return sa_rootdomain; +} + +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ + struct sd_data *sdd = sd->private; + + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) + *per_cpu_ptr(sdd->sg, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) + *per_cpu_ptr(sdd->sgp, cpu) = NULL; +} + +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) +{ + return topology_thread_cpumask(cpu); +} +#endif + +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { sd_init_SIBLING, cpu_smt_mask, }, +#endif +#ifdef CONFIG_SCHED_MC + { sd_init_MC, cpu_coregroup_mask, }, +#endif +#ifdef CONFIG_SCHED_BOOK + { sd_init_BOOK, cpu_book_mask, }, +#endif + { sd_init_CPU, cpu_cpu_mask, }, + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#ifdef CONFIG_NUMA + +static int sched_domains_numa_levels; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; + +static inline int sd_local_flags(int level) +{ + if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) + return 0; + + return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; +} + +static struct sched_domain * +sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +{ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); + int level = tl->numa_level; + int sd_weight = cpumask_weight( + sched_domains_numa_masks[level][cpu_to_node(cpu)]); + + *sd = (struct sched_domain){ + .min_interval = sd_weight, + .max_interval = 2*sd_weight, + .busy_factor = 32, + .imbalance_pct = 125, + .cache_nice_tries = 2, + .busy_idx = 3, + .idle_idx = 2, + .newidle_idx = 0, + .wake_idx = 0, + .forkexec_idx = 0, + + .flags = 1*SD_LOAD_BALANCE + | 1*SD_BALANCE_NEWIDLE + | 0*SD_BALANCE_EXEC + | 0*SD_BALANCE_FORK + | 0*SD_BALANCE_WAKE + | 0*SD_WAKE_AFFINE + | 0*SD_SHARE_CPUPOWER + | 0*SD_SHARE_PKG_RESOURCES + | 1*SD_SERIALIZE + | 0*SD_PREFER_SIBLING + | sd_local_flags(level) + , + .last_balance = jiffies, + .balance_interval = sd_weight, + }; + SD_INIT_NAME(sd, NUMA); + sd->private = &tl->data; + + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; + + return sd; +} + +static const struct cpumask *sd_numa_mask(int cpu) +{ + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + +static void sched_init_numa(void) +{ + int next_distance, curr_distance = node_distance(0, 0); + struct sched_domain_topology_level *tl; + int level = 0; + int i, j, k; + + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + if (!sched_domains_numa_distance) + return; + + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. + * + * Assumes node_distance(0,j) includes all distances in + * node_distance(i,j) in order to avoid cubic time. + */ + next_distance = curr_distance; + for (i = 0; i < nr_node_ids; i++) { + for (j = 0; j < nr_node_ids; j++) { + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; + } + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; + } + /* + * 'level' contains the number of unique distances, excluding the + * identity distance node_distance(i,i). + * + * The sched_domains_numa_distance[] array includes the actual distance + * numbers. + */ + + /* + * Here, we should temporarily reset sched_domains_numa_levels to 0. + * If it fails to allocate memory for array sched_domains_numa_masks[][], + * the array will contain less then 'level' members. This could be + * dangerous when we use it to iterate array sched_domains_numa_masks[][] + * in other functions. + * + * We reset it to 'level' at the end of this function. + */ + sched_domains_numa_levels = 0; + + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + if (!sched_domains_numa_masks) + return; + + /* + * Now for each level, construct a mask per node which contains all + * cpus of nodes that are that many hops away from us. + */ + for (i = 0; i < level; i++) { + sched_domains_numa_masks[i] = + kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!sched_domains_numa_masks[i]) + return; + + for (j = 0; j < nr_node_ids; j++) { + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!mask) + return; + + sched_domains_numa_masks[i][j] = mask; + + for (k = 0; k < nr_node_ids; k++) { + if (node_distance(j, k) > sched_domains_numa_distance[i]) + continue; + + cpumask_or(mask, mask, cpumask_of_node(k)); + } + } + } + + tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + sizeof(struct sched_domain_topology_level), GFP_KERNEL); + if (!tl) + return; + + /* + * Copy the default topology bits.. + */ + for (i = 0; default_topology[i].init; i++) + tl[i] = default_topology[i]; + + /* + * .. and append 'j' levels of NUMA goodness. + */ + for (j = 0; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .init = sd_numa_init, + .mask = sd_numa_mask, + .flags = SDTL_OVERLAP, + .numa_level = j, + }; + } + + sched_domain_topology = tl; + + sched_domains_numa_levels = level; +} + +static void sched_domains_numa_masks_set(int cpu) +{ + int i, j; + int node = cpu_to_node(cpu); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (node_distance(j, node) <= sched_domains_numa_distance[i]) + cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); + } + } +} + +static void sched_domains_numa_masks_clear(int cpu) +{ + int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } +} + +/* + * Update sched_domains_numa_masks[level][node] array when new cpus + * are onlined. + */ +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + sched_domains_numa_masks_set(cpu); + break; + + case CPU_DEAD: + sched_domains_numa_masks_clear(cpu); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} +#else +static inline void sched_init_numa(void) +{ +} + +static int sched_domains_numa_masks_update(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + return 0; +} +#endif /* CONFIG_NUMA */ + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + sdd->sg = alloc_percpu(struct sched_group *); + if (!sdd->sg) + return -ENOMEM; + + sdd->sgp = alloc_percpu(struct sched_group_power *); + if (!sdd->sgp) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_group *sg; + struct sched_group_power *sgp; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return -ENOMEM; + + sg->next = sg; + + *per_cpu_ptr(sdd->sg, j) = sg; + + sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sgp) + return -ENOMEM; + + *per_cpu_ptr(sdd->sgp, j) = sgp; + } + } + + return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgp) + kfree(*per_cpu_ptr(sdd->sgp, j)); + } + free_percpu(sdd->sd); + sdd->sd = NULL; + free_percpu(sdd->sg); + sdd->sg = NULL; + free_percpu(sdd->sgp); + sdd->sgp = NULL; + } +} + +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + struct s_data *d, const struct cpumask *cpu_map, + struct sched_domain_attr *attr, struct sched_domain *child, + int cpu) +{ + struct sched_domain *sd = tl->init(tl, cpu); + if (!sd) + return child; + + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); + child->parent = sd; + } + sd->child = child; + set_domain_attribute(sd, attr); + + return sd; +} + +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) +{ + enum s_alloc alloc_state = sa_none; + struct sched_domain *sd; + struct s_data d; + int i, ret = -ENOMEM; + + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + + /* Set up domains for cpus specified by the cpu_map. */ + for_each_cpu(i, cpu_map) { + struct sched_domain_topology_level *tl; + + sd = NULL; + for (tl = sched_domain_topology; tl->init; tl++) { + sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; + } + + while (sd->child) + sd = sd->child; + + *per_cpu_ptr(d.sd, i) = sd; + } + + /* Build the groups for the domains */ + for_each_cpu(i, cpu_map) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + sd->span_weight = cpumask_weight(sched_domain_span(sd)); + if (sd->flags & SD_OVERLAP) { + if (build_overlap_sched_groups(sd, i)) + goto error; + } else { + if (build_sched_groups(sd, i)) + goto error; + } + } + } + + /* Calculate CPU power for physical packages and nodes */ + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); + init_sched_groups_power(i, sd); + } + } + + /* Attach the domains */ + rcu_read_lock(); + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d.sd, i); + cpu_attach_domain(sd, d.rd, i); + } + rcu_read_unlock(); + + ret = 0; +error: + __free_domain_allocs(&d, alloc_state, cpu_map); + return ret; +} + +static cpumask_var_t *doms_cur; /* current sched domains */ +static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attribues of custom domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. + */ +static cpumask_var_t fallback_doms; + +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) +{ + return 0; +} + +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ +static int init_sched_domains(const struct cpumask *cpu_map) +{ + int err; + + arch_update_cpu_topology(); + ndoms_cur = 1; + doms_cur = alloc_sched_domains(ndoms_cur); + if (!doms_cur) + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + err = build_sched_domains(doms_cur[0], NULL); + register_sched_domain_sysctl(); + + return err; +} + +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static void detach_destroy_domains(const struct cpumask *cpu_map) +{ + int i; + + rcu_read_lock(); + for_each_cpu(i, cpu_map) + cpu_attach_domain(NULL, &def_root_domain, i); + rcu_read_unlock(); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. + * + * If doms_new == NULL it will be replaced with cpu_online_mask. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. + * + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + int i, j, n; + int new_topology; + + mutex_lock(&sched_domains_mutex); + + /* always unregister in case we don't destroy any domains */ + unregister_sched_domain_sysctl(); + + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + + n = doms_new ? ndoms_new : 0; + + /* Destroy deleted domains */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) + goto match1; + } + /* no match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur[i]); +match1: + ; + } + + if (doms_new == NULL) { + ndoms_cur = 0; + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + WARN_ON_ONCE(dattr_new); + } + + /* Build new domains */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < ndoms_cur && !new_topology; j++) { + if (cpumask_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) + goto match2; + } + /* no match - add a new doms_new */ + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); +match2: + ; + } + + /* Remember the new sched domains */ + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ + doms_cur = doms_new; + dattr_cur = dattr_new; + ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); + + mutex_unlock(&sched_domains_mutex); +} + +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + +/* + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. + */ +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action) { + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED_FROZEN: + + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + break; + } + + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + + case CPU_ONLINE: + case CPU_DOWN_FAILED: + cpuset_update_active_cpus(true); + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(false); + break; + case CPU_DOWN_PREPARE_FROZEN: + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +void __init sched_init_smp(void) +{ + cpumask_var_t non_isolated_cpus; + + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + + sched_init_numa(); + + get_online_cpus(); + mutex_lock(&sched_domains_mutex); + init_sched_domains(cpu_active_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); + put_online_cpus(); + + hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); + + /* RT runtime code needs to handle some hotplug events */ + hotcpu_notifier(update_runtime, 0); + + init_hrtick(); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + BUG(); + sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); + + init_sched_rt_class(); +} +#else +void __init sched_init_smp(void) +{ + sched_init_granularity(); +} +#endif /* CONFIG_SMP */ + +const_debug unsigned int sysctl_timer_migration = 1; + +int in_sched_functions(unsigned long addr) +{ + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +#ifdef CONFIG_CGROUP_SCHED +/* + * Default task group. + * Every task in system belongs to this group at bootup. + */ +struct task_group root_task_group; +LIST_HEAD(task_groups); +#endif + +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); + +void __init sched_init(void) +{ + int i, j; + unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_CPUMASK_OFFSTACK + alloc_size += num_possible_cpus() * cpumask_size(); +#endif + if (alloc_size) { + ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); + +#ifdef CONFIG_FAIR_GROUP_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_CPUMASK_OFFSTACK + for_each_possible_cpu(i) { + per_cpu(load_balance_mask, i) = (void *)ptr; + ptr += cpumask_size(); + } +#endif /* CONFIG_CPUMASK_OFFSTACK */ + } + +#ifdef CONFIG_SMP + init_defrootdomain(); +#endif + + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_CGROUP_SCHED + list_add(&root_task_group.list, &task_groups); + INIT_LIST_HEAD(&root_task_group.children); + INIT_LIST_HEAD(&root_task_group.siblings); + autogroup_init(&init_task); + +#endif /* CONFIG_CGROUP_SCHED */ + + for_each_possible_cpu(i) { + struct rq *rq; + + rq = cpu_rq(i); + raw_spin_lock_init(&rq->lock); + rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; +#ifdef CONFIG_PROVE_LOCKING + rq->cpu = i; +#endif + init_cfs_rq(&rq->cfs); + init_rt_rq(&rq->rt, rq); +#ifdef CONFIG_FAIR_GROUP_SCHED + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + /* + * How much cpu bandwidth does root_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. This overall + * system cpu resource is divided among the tasks of + * root_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if root_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting root_task_group's tasks sit + * directly in rq->cfs (i.e root_task_group->se[] = NULL). + */ + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; +#ifdef CONFIG_RT_GROUP_SCHED + INIT_LIST_HEAD(&rq->leaf_rt_rq_list); + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); +#endif + + for (j = 0; j < CPU_LOAD_IDX_MAX; j++) + rq->cpu_load[j] = 0; + + rq->last_load_update_tick = jiffies; + +#ifdef CONFIG_SMP + rq->sd = NULL; + rq->rd = NULL; + rq->cpu_power = SCHED_POWER_SCALE; + rq->post_schedule = 0; + rq->active_balance = 0; + rq->next_balance = jiffies; + rq->push_cpu = 0; + rq->cpu = i; + rq->online = 0; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; + + INIT_LIST_HEAD(&rq->cfs_tasks); + + rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ_COMMON + rq->nohz_flags = 0; +#endif +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = 0; +#endif +#endif + init_rq_hrtick(rq); + atomic_set(&rq->nr_iowait, 0); + } + + set_load_weight(&init_task); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&init_task.pi_waiters); +#endif + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". + */ + init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + + /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + +#ifdef CONFIG_SMP + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + idle_thread_set_boot_cpu(); +#endif + init_sched_fair_class(); + + scheduler_running = 1; +} + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + + return (nested == preempt_offset); +} + +static int __might_sleep_init_called; +int __init __might_sleep_init(void) +{ + __might_sleep_init_called = 1; + return 0; +} +early_initcall(__might_sleep_init); + +void __might_sleep(const char *file, int line, int preempt_offset) +{ + static unsigned long prev_jiffy; /* ratelimiting */ + + rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ + if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + oops_in_progress) + return; + if (system_state != SYSTEM_RUNNING && + (!__might_sleep_init_called || system_state != SYSTEM_BOOTING)) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ + const struct sched_class *prev_class = p->sched_class; + int old_prio = p->prio; + int on_rq; + + on_rq = p->on_rq; + if (on_rq) + dequeue_task(rq, p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + enqueue_task(rq, p, 0); + resched_task(rq->curr); + } + + check_class_changed(rq, p, prev_class, old_prio); +} + +void normalize_rt_tasks(void) +{ + struct task_struct *g, *p; + unsigned long flags; + struct rq *rq; + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, p) { + /* + * Only normalize user tasks: + */ + if (!p->mm) + continue; + + p->se.exec_start = 0; +#ifdef CONFIG_SCHEDSTATS + p->se.statistics.wait_start = 0; + p->se.statistics.sleep_start = 0; + p->se.statistics.block_start = 0; +#endif + + if (!rt_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: + */ + if (TASK_NICE(p) < 0 && p->mm) + set_user_nice(p, 0); + continue; + } + + raw_spin_lock(&p->pi_lock); + rq = __task_rq_lock(p); + + normalize_task(rq, p); + + __task_rq_unlock(rq); + raw_spin_unlock(&p->pi_lock); + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); +} + +#endif /* CONFIG_MAGIC_SYSRQ */ + +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) +/* + * These functions are only useful for the IA64 MCA handling, or kdb. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +struct task_struct *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64 +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, struct task_struct *p) +{ + cpu_curr(cpu) = p; +} + +#endif + +#ifdef CONFIG_CGROUP_SCHED +/* task_group_lock serializes the addition/removal of task groups */ +static DEFINE_SPINLOCK(task_group_lock); + +static void free_sched_group(struct task_group *tg) +{ + free_fair_sched_group(tg); + free_rt_sched_group(tg); + autogroup_free(tg); + kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(struct task_group *parent) +{ + struct task_group *tg; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + if (!alloc_fair_sched_group(tg, parent)) + goto err; + + if (!alloc_rt_sched_group(tg, parent)) + goto err; + + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ + unsigned long flags; + + spin_lock_irqsave(&task_group_lock, flags); + list_add_rcu(&tg->list, &task_groups); + + WARN_ON(!parent); /* root should already exist */ + + tg->parent = parent; + INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); + spin_unlock_irqrestore(&task_group_lock, flags); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group_rcu(struct rcu_head *rhp) +{ + /* now it should be safe to free those cfs_rqs */ + free_sched_group(container_of(rhp, struct task_group, rcu)); +} + +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_group *tg) +{ + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +void sched_offline_group(struct task_group *tg) +{ + unsigned long flags; + int i; + + /* end participation in shares distribution */ + for_each_possible_cpu(i) + unregister_fair_sched_group(tg, i); + + spin_lock_irqsave(&task_group_lock, flags); + list_del_rcu(&tg->list); + list_del_rcu(&tg->siblings); + spin_unlock_irqrestore(&task_group_lock, flags); +} + +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + struct task_group *tg; + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + running = task_current(rq, tsk); + on_rq = tsk->on_rq; + + if (on_rq) + dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + + tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + lockdep_is_held(&tsk->sighand->siglock)), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->task_move_group) + tsk->sched_class->task_move_group(tsk, on_rq); + else +#endif + set_task_rq(tsk, task_cpu(tsk)); + + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, tsk, 0); + + task_rq_unlock(rq, tsk, &flags); +} +#endif /* CONFIG_CGROUP_SCHED */ + +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + return div64_u64(runtime << 20, period); +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); + +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *g, *p; + + do_each_thread(g, p) { + if (rt_task(p) && task_rq(p)->rt.tg == tg) + return 1; + } while_each_thread(g, p); + + return 0; +} + +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; + +static int tg_rt_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); + } + + if (sum > total) + return -EINVAL; + + return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + int ret; + + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + rcu_read_lock(); + ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_unlock(); + + return ret; +} + +static int tg_set_rt_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) +{ + int i, err = 0; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) + goto unlock; + + raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); +unlock: + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return err; +} + +static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +static long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_bandwidth.rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} + +static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + if (rt_period == 0) + return -EINVAL; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +static long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + u64 runtime, period; + int ret = 0; + + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return ret; +} + +static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* make sure that internally we keep jiffies */ + /* also, writing zero resets timeslice to default */ + if (!ret && write) { + sched_rr_timeslice = sched_rr_timeslice <= 0 ? + RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + +int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} + +#ifdef CONFIG_CGROUP_SCHED + +/* return corresponding task_group object of a cgroup */ +static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), + struct task_group, css); +} + +static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) +{ + struct task_group *tg, *parent; + + if (!cgrp->parent) { + /* This is early initialization for the top cgroup */ + return &root_task_group.css; + } + + parent = cgroup_tg(cgrp->parent); + tg = sched_create_group(parent); + if (IS_ERR(tg)) + return ERR_PTR(-ENOMEM); + + return &tg->css; +} + +static int cpu_cgroup_css_online(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct task_group *parent; + + if (!cgrp->parent) + return 0; + + parent = cgroup_tg(cgrp->parent); + sched_online_group(tg, parent); + return 0; +} + +static void cpu_cgroup_css_free(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_destroy_group(tg); +} + +static void cpu_cgroup_css_offline(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_offline_group(tg); +} + +static int cpu_cgroup_can_attach(struct cgroup *cgrp, + struct cgroup_taskset *tset) +{ + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) { +#ifdef CONFIG_RT_GROUP_SCHED + if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + return -ERTGROUP; +#else + /* We don't support RT-tasks being in separate groups */ + if (task->sched_class != &fair_sched_class) + return -EINVAL; +#endif + } + return 0; +} + +static void cpu_cgroup_attach(struct cgroup *cgrp, + struct cgroup_taskset *tset) +{ + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) + sched_move_task(task); +} + +static void +cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + sched_move_task(task); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 shareval) +{ + return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); +} + +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return (u64) scale_load_down(tg->shares); +} + +#ifdef CONFIG_CFS_BANDWIDTH +static DEFINE_MUTEX(cfs_constraints_mutex); + +const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ +const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); + +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +{ + int i, ret = 0, runtime_enabled, runtime_was_enabled; + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + if (tg == &root_task_group) + return -EINVAL; + + /* + * Ensure we have at some amount of bandwidth every period. This is + * to prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota < min_cfs_quota_period || period < min_cfs_quota_period) + return -EINVAL; + + /* + * Likewise, bound things on the otherside by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. + */ + if (period > max_cfs_quota_period) + return -EINVAL; + + mutex_lock(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); + if (ret) + goto out_unlock; + + runtime_enabled = quota != RUNTIME_INF; + runtime_was_enabled = cfs_b->quota != RUNTIME_INF; + /* + * If we need to toggle cfs_bandwidth_used, off->on must occur + * before making related changes, and on->off must occur afterwards + */ + if (runtime_enabled && !runtime_was_enabled) + cfs_bandwidth_usage_inc(); + raw_spin_lock_irq(&cfs_b->lock); + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + + __refill_cfs_bandwidth_runtime(cfs_b); + /* restart the period timer (if active) to handle new period expiry */ + if (runtime_enabled && cfs_b->timer_active) { + /* force a reprogram */ + cfs_b->timer_active = 0; + __start_cfs_bandwidth(cfs_b); + } + raw_spin_unlock_irq(&cfs_b->lock); + + for_each_possible_cpu(i) { + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct rq *rq = cfs_rq->rq; + + raw_spin_lock_irq(&rq->lock); + cfs_rq->runtime_enabled = runtime_enabled; + cfs_rq->runtime_remaining = 0; + + if (cfs_rq->throttled) + unthrottle_cfs_rq(cfs_rq); + raw_spin_unlock_irq(&rq->lock); + } + if (runtime_was_enabled && !runtime_enabled) + cfs_bandwidth_usage_dec(); +out_unlock: + mutex_unlock(&cfs_constraints_mutex); + + return ret; +} + +int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +{ + u64 quota, period; + + period = ktime_to_ns(tg->cfs_bandwidth.period); + if (cfs_quota_us < 0) + quota = RUNTIME_INF; + else + quota = (u64)cfs_quota_us * NSEC_PER_USEC; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_quota(struct task_group *tg) +{ + u64 quota_us; + + if (tg->cfs_bandwidth.quota == RUNTIME_INF) + return -1; + + quota_us = tg->cfs_bandwidth.quota; + do_div(quota_us, NSEC_PER_USEC); + + return quota_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ + u64 quota, period; + + period = (u64)cfs_period_us * NSEC_PER_USEC; + quota = tg->cfs_bandwidth.quota; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_period(struct task_group *tg) +{ + u64 cfs_period_us; + + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); + do_div(cfs_period_us, NSEC_PER_USEC); + + return cfs_period_us; +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_quota(cgroup_tg(cgrp)); +} + +static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, + s64 cfs_quota_us) +{ + return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 cfs_period_us) +{ + return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +struct cfs_schedulable_data { + struct task_group *tg; + u64 period, quota; +}; + +/* + * normalize group quota/period to be quota/max_period + * note: units are usecs + */ +static u64 normalize_cfs_quota(struct task_group *tg, + struct cfs_schedulable_data *d) +{ + u64 quota, period; + + if (tg == d->tg) { + period = d->period; + quota = d->quota; + } else { + period = tg_get_cfs_period(tg); + quota = tg_get_cfs_quota(tg); + } + + /* note: these should typically be equivalent */ + if (quota == RUNTIME_INF || quota == -1) + return RUNTIME_INF; + + return to_ratio(period, quota); +} + +static int tg_cfs_schedulable_down(struct task_group *tg, void *data) +{ + struct cfs_schedulable_data *d = data; + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + s64 quota = 0, parent_quota = -1; + + if (!tg->parent) { + quota = RUNTIME_INF; + } else { + struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; + + quota = normalize_cfs_quota(tg, d); + parent_quota = parent_b->hierarchal_quota; + + /* + * ensure max(child_quota) <= parent_quota, inherit when no + * limit is set + */ + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF && quota > parent_quota) + return -EINVAL; + } + cfs_b->hierarchal_quota = quota; + + return 0; +} + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) +{ + int ret; + struct cfs_schedulable_data data = { + .tg = tg, + .period = period, + .quota = quota, + }; + + if (quota != RUNTIME_INF) { + do_div(data.period, NSEC_PER_USEC); + do_div(data.quota, NSEC_PER_USEC); + } + + rcu_read_lock(); + ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); + rcu_read_unlock(); + + return ret; +} + +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + cb->fill(cb, "nr_periods", cfs_b->nr_periods); + cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); + cb->fill(cb, "throttled_time", cfs_b->throttled_time); + + return 0; +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + s64 val) +{ + return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); +} + +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_runtime(cgroup_tg(cgrp)); +} + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) +{ + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "shares", + .read_u64 = cpu_shares_read_u64, + .write_u64 = cpu_shares_write_u64, + }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .name = "cfs_quota_us", + .read_s64 = cpu_cfs_quota_read_s64, + .write_s64 = cpu_cfs_quota_write_s64, + }, + { + .name = "cfs_period_us", + .read_u64 = cpu_cfs_period_read_u64, + .write_u64 = cpu_cfs_period_write_u64, + }, + { + .name = "stat", + .read_map = cpu_stats_show, + }, +#endif +#ifdef CONFIG_RT_GROUP_SCHED + { + .name = "rt_runtime_us", + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, + }, +#endif + { } /* terminate */ +}; + +struct cgroup_subsys cpu_cgroup_subsys = { + .name = "cpu", + .css_alloc = cpu_cgroup_css_alloc, + .css_free = cpu_cgroup_css_free, + .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .allow_attach = subsys_cgroup_allow_attach, + .exit = cpu_cgroup_exit, + .subsys_id = cpu_cgroup_subsys_id, + .base_cftypes = cpu_files, + .early_init = 1, +}; + +#endif /* CONFIG_CGROUP_SCHED */ + +void dump_cpu_task(int cpu) +{ + pr_info("Task dump for CPU %d:\n", cpu); + sched_show_task(cpu_curr(cpu)); +} + +unsigned long long mt_get_thread_cputime(pid_t pid) +{ + struct task_struct *p; + p = pid ? find_task_by_vpid(pid) : current; + return task_sched_runtime(p); +} +unsigned long long mt_get_cpu_idle(int cpu) +{ + unsigned long long *unused = 0; + return get_cpu_idle_time_us(cpu, unused); +} +unsigned long long mt_sched_clock(void) +{ + return sched_clock(); +} +EXPORT_SYMBOL(mt_get_thread_cputime); +EXPORT_SYMBOL(mt_get_cpu_idle); +EXPORT_SYMBOL(mt_sched_clock); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 000000000..dbb7e2cd9 --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,296 @@ +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/seq_file.h> +#include <linux/rcupdate.h> +#include <linux/kernel_stat.h> +#include <linux/err.h> + +#include "sched.h" + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ + return cgroup_ca(ca->css.cgroup->parent); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static struct cpuacct root_cpuacct = { + .cpustat = &kernel_cpustat, + .cpuusage = &root_cpuacct_cpuusage, +}; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +{ + struct cpuacct *ca; + + if (!cgrp->parent) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; + + return &ca->css; + +out_free_cpuusage: + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void cpuacct_css_free(struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpustat); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, + { } /* terminate */ +}; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + while (true) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + + ca = parent_ca(ca); + if (!ca) + break; + } + + rcu_read_unlock(); +} + +/* + * Add user/system time to cpuacct. + * + * Note: it's the caller that updates the account of the root cgroup. + */ +void cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; + + rcu_read_lock(); + ca = task_ca(p); + while (ca != &root_cpuacct) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += val; + ca = __parent_ca(ca); + } + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, + .early_init = 1, +}; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 000000000..ed605624a --- /dev/null +++ b/kernel/sched/cpuacct.h @@ -0,0 +1,17 @@ +#ifdef CONFIG_CGROUP_CPUACCT + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} + +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + +#endif diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c new file mode 100644 index 000000000..b3f0a2783 --- /dev/null +++ b/kernel/sched/cpupri.c @@ -0,0 +1,241 @@ +/* + * kernel/sched/cpupri.c + * + * CPU priority management + * + * Copyright (C) 2007-2008 Novell + * + * Author: Gregory Haskins <ghaskins@novell.com> + * + * This code tracks the priority of each CPU so that global migration + * decisions are easy to calculate. Each CPU can be in a state as follows: + * + * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + * going from the lowest priority to the highest. CPUs in the INVALID state + * are not eligible for routing. The system maintains this state with + * a 2 dimensional bitmap (the first for priority class, the second for cpus + * in that class). Therefore a typical application without affinity + * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + * searches). For tasks with affinity restrictions, the algorithm has a + * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * yields the worst case search is fairly contrived. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> +#include "cpupri.h" + +/* Convert between a 140 based task->prio, and our 102 based cpupri */ +static int convert_prio(int prio) +{ + int cpupri; + + if (prio == CPUPRI_INVALID) + cpupri = CPUPRI_INVALID; + else if (prio == MAX_PRIO) + cpupri = CPUPRI_IDLE; + else if (prio >= MAX_RT_PRIO) + cpupri = CPUPRI_NORMAL; + else + cpupri = MAX_RT_PRIO - prio + 1; + + return cpupri; +} + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * + * Note: This function returns the recommended CPUs as calculated during the + * current invocation. By the time the call returns, the CPUs may have in + * fact changed priorities any number of times. While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration. + * + * Returns: (int)bool - CPUs were found + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask) +{ + int idx = 0; + int task_pri = convert_prio(p->prio); + + BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); + + for (idx = 0; idx < task_pri; idx++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + int skip = 0; + + if (!atomic_read(&(vec)->count)) + skip = 1; + /* + * When looking at the vector, we need to read the counter, + * do a memory barrier, then read the mask. + * + * Note: This is still all racey, but we can deal with it. + * Ideally, we only want to look at masks that are set. + * + * If a mask is not set, then the only thing wrong is that we + * did a little more work than necessary. + * + * If we read a zero count but the mask is set, because of the + * memory barriers, that can only happen when the highest prio + * task for a run queue has left the run queue, in which case, + * it will be followed by a pull. If the task we are processing + * fails to find a proper place to go, that pull request will + * pull this task if the run queue is running at a lower + * priority. + */ + smp_rmb(); + + /* Need to do the rmb for every iteration */ + if (skip) + continue; + + if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + continue; + + if (lowest_mask) { + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + + /* + * We have to ensure that we have at least one bit + * still set in the array, since the map could have + * been concurrently emptied between the first and + * second reads of vec->mask. If we hit this + * condition, simply act as though we never hit this + * priority level and continue on. + */ + if (cpumask_any(lowest_mask) >= nr_cpu_ids) + continue; + } + + return 1; + } + + return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @newpri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + int do_mb = 0; + + newpri = convert_prio(newpri); + + BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + + if (newpri == oldpri) + return; + + /* + * If the cpu was currently mapped to a different value, we + * need to map it to the new value then remove the old value. + * Note, we must add the new value first, otherwise we risk the + * cpu being missed by the priority loop in cpupri_find. + */ + if (likely(newpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + + cpumask_set_cpu(cpu, vec->mask); + /* + * When adding a new vector, we update the mask first, + * do a write memory barrier, and then update the count, to + * make sure the vector is visible when count is set. + */ + smp_mb__before_atomic_inc(); + atomic_inc(&(vec)->count); + do_mb = 1; + } + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + /* + * Because the order of modification of the vec->count + * is important, we must make sure that the update + * of the new prio is seen before we decrement the + * old prio. This makes sure that the loop sees + * one or the other when we raise the priority of + * the run queue. We don't care about when we lower the + * priority, as that will trigger an rt pull anyway. + * + * We only need to do a memory barrier if we updated + * the new priority vec. + */ + if (do_mb) + smp_mb__after_atomic_inc(); + + /* + * When removing from the vector, we decrement the counter first + * do a memory barrier and then clear the mask. + */ + atomic_dec(&(vec)->count); + smp_mb__after_atomic_inc(); + cpumask_clear_cpu(cpu, vec->mask); + } + + *currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * + * Returns: -ENOMEM if memory fails. + */ +int cpupri_init(struct cpupri *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + + atomic_set(&vec->count, 0); + if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) + goto cleanup; + } + + for_each_possible_cpu(i) + cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; + +cleanup: + for (i--; i >= 0; i--) + free_cpumask_var(cp->pri_to_cpu[i].mask); + return -ENOMEM; +} + +/** + * cpupri_cleanup - clean up the cpupri structure + * @cp: The cpupri context + */ +void cpupri_cleanup(struct cpupri *cp) +{ + int i; + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) + free_cpumask_var(cp->pri_to_cpu[i].mask); +} diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h new file mode 100644 index 000000000..f6d756173 --- /dev/null +++ b/kernel/sched/cpupri.h @@ -0,0 +1,34 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include <linux/sched.h> + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + atomic_t count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, struct cpumask *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp); +void cpupri_cleanup(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 000000000..c23a8fd36 --- /dev/null +++ b/kernel/sched/cputime.c @@ -0,0 +1,854 @@ +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/tsacct_kern.h> +#include <linux/kernel_stat.h> +#include <linux/static_key.h> +#include <linux/context_tracking.h> +#include "sched.h" + + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in vtime_account, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/vtime_account on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +DEFINE_PER_CPU(u64, cpu_hardirq_time); +DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +DEFINE_PER_CPU(seqcount_t, irq_time_seq); +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void irqtime_account_irq(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(irqtime_account_irq); + +static int irqtime_account_hi_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) + ret = 1; + local_irq_restore(flags); + return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void task_group_account_field(struct task_struct *p, int index, + u64 tmp) +{ + /* + * Since all updates are sure to touch the root cgroup, we + * get ourselves ahead and touch it first. If the root cgroup + * is the only cgroup, then nothing else should be necessary. + * + */ + __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + + cpuacct_account_field(p, index, tmp); +} + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + int index; + + /* Add user time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + + index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for user time used */ + acct_account_cputime(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + /* Add guest time to process. */ + p->utime += cputime; + p->utimescaled += cputime_scaled; + account_group_user_time(p, cputime); + p->gtime += cputime; + + /* Add guest time to cpustat. */ + if (TASK_NICE(p) > 0) { + cpustat[CPUTIME_NICE] += (__force u64) cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; + } else { + cpustat[CPUTIME_USER] += (__force u64) cputime; + cpustat[CPUTIME_GUEST] += (__force u64) cputime; + } +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, int index) +{ + /* Add system time to process. */ + p->stime += cputime; + p->stimescaled += cputime_scaled; + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + task_group_account_field(p, index, (__force u64) cputime); + + /* Account for system time used */ + acct_account_cputime(p); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + int index; + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime, cputime_scaled); + return; + } + + if (hardirq_count() - hardirq_offset) + index = CPUTIME_IRQ; + else if (in_serving_softirq()) + index = CPUTIME_SOFTIRQ; + else + index = CPUTIME_SYSTEM; + + __account_system_time(p, cputime, cputime_scaled, index); +} + +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + cpustat[CPUTIME_STEAL] += (__force u64) cputime; +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; + else + cpustat[CPUTIME_IDLE] += (__force u64) cputime; +} + +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT + if (static_key_false(¶virt_steal_enabled)) { + u64 steal, st = 0; + + steal = paravirt_steal_clock(smp_processor_id()); + steal -= this_rq()->prev_steal_time; + + st = steal_ticks(steal); + this_rq()->prev_steal_time += st * TICK_NSEC; + + account_steal_time(st); + return st; + } +#endif + return false; +} + +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + cputime_t utime, stime; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq, int ticks) +{ + cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); + u64 cputime = (__force u64) cputime_one_jiffy; + u64 *cpustat = kcpustat_this_cpu->cpustat; + + if (steal_account_process_tick()) + return; + + cputime *= ticks; + scaled *= ticks; + + if (irqtime_account_hi_update()) { + cpustat[CPUTIME_IRQ] += cputime; + } else if (irqtime_account_si_update()) { + cpustat[CPUTIME_SOFTIRQ] += cputime; + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. + */ + __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); + } else if (user_tick) { + account_user_time(p, cputime, scaled); + } else if (p == rq->idle) { + account_idle_time(cputime); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime, scaled); + } else { + __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + struct rq *rq = this_rq(); + + irqtime_account_process_tick(current, 0, rq, ticks); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static inline void irqtime_account_idle_ticks(int ticks) {} +static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq, int nr_ticks) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_task_switch(struct task_struct *prev) +{ + if (!vtime_accounting_enabled()) + return; + + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + vtime_account_user(prev); +#endif + arch_vtime_task_switch(prev); +} +#endif + +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + if (!in_interrupt()) { + /* + * If we interrupted user, context_tracking_in_user() + * is 1 because the context tracking don't hook + * on irq entry/exit. This way we know if + * we need to flush user time on kernel entry. + */ + if (context_tracking_in_user()) { + vtime_account_user(tsk); + return; + } + + if (is_idle_task(tsk)) { + vtime_account_idle(tsk); + return; + } + } + vtime_account_system(tsk); +} +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} + +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + struct rq *rq = this_rq(); + + if (vtime_accounting_enabled()) + return; + + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq, 1); + return; + } + + if (steal_account_process_tick()) + return; + + if (user_tick) + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, + one_jiffy_scaled); + else + account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + + account_idle_time(jiffies_to_cputime(ticks)); +} + +/* + * Perform (stime * rtime) / total, but avoid multiplication overflow by + * loosing precision when the numbers are big. + */ +static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) +{ + u64 scaled; + + for (;;) { + /* Make sure "rtime" is the bigger of stime/rtime */ + if (stime > rtime) { + u64 tmp = rtime; rtime = stime; stime = tmp; + } + + /* Make sure 'total' fits in 32 bits */ + if (total >> 32) + goto drop_precision; + + /* Does rtime (and thus stime) fit in 32 bits? */ + if (!(rtime >> 32)) + break; + + /* Can we just balance rtime/stime rather than dropping bits? */ + if (stime >> 31) + goto drop_precision; + + /* We can grow stime and shrink rtime and try to make them both fit */ + stime <<= 1; + rtime >>= 1; + continue; + +drop_precision: + /* We drop from rtime, it has more bits than stime */ + rtime >>= 1; + total >>= 1; + } + + /* + * Make sure gcc understands that this is a 32x32->64 multiply, + * followed by a 64/32->64 divide. + */ + scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); + return (__force cputime_t) scaled; +} + +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) +{ + cputime_t rtime, stime, utime; + + if (vtime_accounting_enabled()) { + *ut = curr->utime; + *st = curr->stime; + return; + } + + /* + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. + */ + rtime = nsecs_to_cputime(curr->sum_exec_runtime); + + /* + * Update userspace visible utime/stime values only if actual execution + * time is bigger than already exported. Note that can happen, that we + * provided bigger values due to scaling inaccuracy on big numbers. + */ + if (prev->stime + prev->utime >= rtime) + goto out; + + stime = curr->stime; + utime = curr->utime; + + if (utime == 0) { + stime = rtime; + } else if (stime == 0) { + utime = rtime; + } else { + cputime_t total = stime + utime; + + stime = scale_stime((__force u64)stime, + (__force u64)rtime, (__force u64)total); + utime = rtime - stime; + } + + /* + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. + */ + prev->stime = max(prev->stime, stime); + prev->utime = max(prev->utime, utime); + +out: + *ut = prev->utime; + *st = prev->stime; +} + +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + task_cputime(p, &cputime.utime, &cputime.stime); + cputime_adjust(&cputime, &p->prev_cputime, ut, st); +} + +/* + * Must be called with siglock held. + */ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); +} +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +static unsigned long long vtime_delta(struct task_struct *tsk) +{ + unsigned long long clock; + + clock = local_clock(); + if (clock < tsk->vtime_snap) + return 0; + + return clock - tsk->vtime_snap; +} + +static cputime_t get_vtime_delta(struct task_struct *tsk) +{ + unsigned long long delta = vtime_delta(tsk); + + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); + tsk->vtime_snap += delta; + + /* CHECKME: always safe to convert nsecs to cputime? */ + return nsecs_to_cputime(delta); +} + +static void __vtime_account_system(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); +} + +void vtime_account_system(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_irq_exit(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + if (context_tracking_in_user()) + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_user(struct task_struct *tsk) +{ + cputime_t delta_cpu; + + if (!vtime_accounting_enabled()) + return; + + delta_cpu = get_vtime_delta(tsk); + + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_SYS; + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_user_enter(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_enter(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags |= PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_exit(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags &= ~PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_idle(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_idle_time(delta_cpu); +} + +bool vtime_accounting_enabled(void) +{ + return context_tracking_active(); +} + +void arch_vtime_task_switch(struct task_struct *prev) +{ + write_seqlock(&prev->vtime_seqlock); + prev->vtime_snap_whence = VTIME_SLEEPING; + write_sequnlock(&prev->vtime_seqlock); + + write_seqlock(¤t->vtime_seqlock); + current->vtime_snap_whence = VTIME_SYS; + current->vtime_snap = sched_clock_cpu(smp_processor_id()); + write_sequnlock(¤t->vtime_seqlock); +} + +void vtime_init_idle(struct task_struct *t, int cpu) +{ + unsigned long flags; + + write_seqlock_irqsave(&t->vtime_seqlock, flags); + t->vtime_snap_whence = VTIME_SYS; + t->vtime_snap = sched_clock_cpu(cpu); + write_sequnlock_irqrestore(&t->vtime_seqlock, flags); +} + +cputime_t task_gtime(struct task_struct *t) +{ + unsigned int seq; + cputime_t gtime; + + do { + seq = read_seqbegin(&t->vtime_seqlock); + + gtime = t->gtime; + if (t->flags & PF_VCPU) + gtime += vtime_delta(t); + + } while (read_seqretry(&t->vtime_seqlock, seq)); + + return gtime; +} + +/* + * Fetch cputime raw values from fields of task_struct and + * add up the pending nohz execution time since the last + * cputime snapshot. + */ +static void +fetch_task_cputime(struct task_struct *t, + cputime_t *u_dst, cputime_t *s_dst, + cputime_t *u_src, cputime_t *s_src, + cputime_t *udelta, cputime_t *sdelta) +{ + unsigned int seq; + unsigned long long delta; + + do { + *udelta = 0; + *sdelta = 0; + + seq = read_seqbegin(&t->vtime_seqlock); + + if (u_dst) + *u_dst = *u_src; + if (s_dst) + *s_dst = *s_src; + + /* Task is sleeping, nothing to add */ + if (t->vtime_snap_whence == VTIME_SLEEPING || + is_idle_task(t)) + continue; + + delta = vtime_delta(t); + + /* + * Task runs either in user or kernel space, add pending nohz time to + * the right place. + */ + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { + *udelta = delta; + } else { + if (t->vtime_snap_whence == VTIME_SYS) + *sdelta = delta; + } + } while (read_seqretry(&t->vtime_seqlock, seq)); +} + + +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utime, stime, &t->utime, + &t->stime, &udelta, &sdelta); + if (utime) + *utime += udelta; + if (stime) + *stime += sdelta; +} + +void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, cputime_t *stimescaled) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utimescaled, stimescaled, + &t->utimescaled, &t->stimescaled, &udelta, &sdelta); + if (utimescaled) + *utimescaled += cputime_to_scaled(udelta); + if (stimescaled) + *stimescaled += cputime_to_scaled(sdelta); +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c new file mode 100644 index 000000000..d0dd5ae8d --- /dev/null +++ b/kernel/sched/debug.c @@ -0,0 +1,1288 @@ +/* + * kernel/sched/debug.c + * + * Print the CFS rbtree + * + * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/utsname.h> +#ifdef CONFIG_KGDB_KDB +#include <linux/kdb.h> +#endif + +#include "sched.h" + +//#define TEST_SCHED_DEBUG_ENHANCEMENT +//#define MTK_SCHED_CMP_PRINT +#define TRYLOCK_NUM 10 +#include <linux/delay.h> +static DEFINE_SPINLOCK(sched_debug_lock); + +DECLARE_PER_CPU(u64, exec_delta_time); +DECLARE_PER_CPU(u64, clock_task); +DECLARE_PER_CPU(u64, exec_start); +DECLARE_PER_CPU(struct task_struct, exec_task); + +/* + * This allows printing both to /proc/sched_debug and + * to the console + */ +#ifndef CONFIG_KGDB_KDB +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) +#else +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else if (__get_cpu_var(kdb_in_use) == 1) \ + kdb_printf(x); \ + else \ + printk(x); \ + } while (0) +#endif +/* + * Ease the printing of nsec fields: + */ +static long long nsec_high(unsigned long long nsec) +{ + if ((long long)nsec < 0) { + nsec = -nsec; + do_div(nsec, 1000000); + return -nsec; + } + do_div(nsec, 1000000); + + return nsec; +} + +static unsigned long nsec_low(unsigned long long nsec) +{ + if ((long long)nsec < 0) + nsec = -nsec; + + return do_div(nsec, 1000000); +} + +#define SPLIT_NS(x) nsec_high(x), nsec_low(x) + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) +{ + struct sched_entity *se = tg->se[cpu]; + +#define P(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define PN(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + + if (!se) { + struct sched_avg *avg = &cpu_rq(cpu)->avg; + P(avg->runnable_avg_sum); + P(avg->runnable_avg_period); +#ifdef MTK_SCHED_CMP_PRINT +# ifdef CONFIG_MTK_SCHED_CMP + /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11 */ + P(avg->usage_avg_sum); + P(avg->load_avg_ratio); +# endif + P(avg->last_runnable_update); +#endif + return; + } + + + PN(se->exec_start); + PN(se->vruntime); + PN(se->sum_exec_runtime); +#ifdef CONFIG_SCHEDSTATS + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); +#endif + P(se->load.weight); +#ifdef CONFIG_SMP + P(se->avg.runnable_avg_sum); + P(se->avg.runnable_avg_period); + P(se->avg.usage_avg_sum); + P(se->avg.load_avg_contrib); + P(se->avg.decay_count); + +# ifdef MTK_SCHED_CMP_PRINT +# ifdef CONFIG_MTK_SCHED_CMP + /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11 */ + P(se->avg.usage_avg_sum); + P(se->avg.load_avg_ratio); +# endif + P(se->avg.last_runnable_update); +# endif +#endif +#undef PN +#undef P +} +#endif + +#ifdef CONFIG_CGROUP_SCHED +static char group_path[PATH_MAX]; + +static char *task_group_path(struct task_group *tg) +{ + if (autogroup_path(tg, group_path, PATH_MAX)) + return group_path; + + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; +} +#endif + +static void +print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +{ + if (rq->curr == p) + SEQ_printf(m, "R"); + else + SEQ_printf(m, " "); + + SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + p->comm, p->pid, + SPLIT_NS(p->se.vruntime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); +#ifdef CONFIG_SCHEDSTATS + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); +#else + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", + 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); +#endif +#ifdef CONFIG_CGROUP_SCHED + SEQ_printf(m, " %s", task_group_path(task_group(p))); +#endif + + SEQ_printf(m, "\n"); +} + +static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +{ + struct task_struct *g, *p; + unsigned long flags; + + SEQ_printf(m, + "\nrunnable tasks:\n" + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" + "------------------------------------------------------" + "----------------------------------------------------\n"); + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, p) { + if (!p->on_rq || task_cpu(p) != rq_cpu) + continue; + + print_task(m, rq, p); + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); +} + +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +{ + s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, + spread, rq0_min_vruntime, spread0; + struct rq *rq = cpu_rq(cpu); + struct sched_entity *last; + unsigned long flags; + +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); +#else + SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#endif + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_rq->exec_clock)); + + raw_spin_lock_irqsave(&rq->lock, flags); + if (cfs_rq->rb_leftmost) + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; + min_vruntime = cfs_rq->min_vruntime; + rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", + cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); +# ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", + cfs_rq->tg_load_contrib); + SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", + cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-30s: %ld\n", "tg->load_avg", + atomic_long_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", + atomic_read(&cfs_rq->tg->runnable_avg)); + SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg", + atomic_read(&cfs_rq->tg->usage_avg)); +# endif +# ifdef CONFIG_CFS_BANDWIDTH + SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", + cfs_rq->tg->cfs_bandwidth.timer_active); + SEQ_printf(m, " .%-30s: %d\n", "throttled", + cfs_rq->throttled); + SEQ_printf(m, " .%-30s: %d\n", "throttle_count", + cfs_rq->throttle_count); +# endif + +# ifdef CONFIG_FAIR_GROUP_SCHED + print_cfs_group_stats(m, cpu, cfs_rq->tg); +# endif +#endif +} + +void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) +{ +#ifdef CONFIG_RT_GROUP_SCHED + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +#else + SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + P(rt_nr_running); + P(rt_throttled); + PN(rt_time); + PN(rt_runtime); + +#undef PN +#undef P +} + +extern __read_mostly int sched_clock_running; + +static void print_cpu(struct seq_file *m, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + +#ifdef CONFIG_X86 + { + unsigned int freq = cpu_khz ? : 1; + + SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", + cpu, freq / 1000, (freq % 1000)); + } +#else + SEQ_printf(m, "cpu#%d: %s\n", cpu, cpu_is_offline(cpu)?"Offline":"Online"); +#endif + +#define P(x) \ +do { \ + if (sizeof(rq->x) == 4) \ + SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ + else \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ +} while (0) + +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) + + P(nr_running); + SEQ_printf(m, " .%-30s: %lu\n", "load", + rq->load.weight); + P(nr_switches); + P(nr_load_updates); + P(nr_uninterruptible); + PN(next_balance); + P(curr->pid); + PN(clock); + P(cpu_load[0]); + P(cpu_load[1]); + P(cpu_load[2]); + P(cpu_load[3]); + P(cpu_load[4]); +#undef P +#undef PN + +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); + + P(yld_count); + + P(sched_count); + P(sched_goidle); +#ifdef CONFIG_SMP + P64(avg_idle); +#endif + + P(ttwu_count); + P(ttwu_local); + +#undef P +#undef P64 +#endif + spin_lock_irqsave(&sched_debug_lock, flags); + print_cfs_stats(m, cpu); + print_rt_stats(m, cpu); + + rcu_read_lock(); + print_rq(m, rq, cpu); + rcu_read_unlock(); + spin_unlock_irqrestore(&sched_debug_lock, flags); + SEQ_printf(m, "\n"); +} + +static const char *sched_tunable_scaling_names[] = { + "none", + "logaritmic", + "linear" +}; + +#ifdef TEST_SCHED_DEBUG_ENHANCEMENT +extern void lock_timekeeper(void); +#endif +static void sched_debug_header(struct seq_file *m) +{ + u64 ktime, sched_clk, cpu_clk; + unsigned long flags; + +#ifdef TEST_SCHED_DEBUG_ENHANCEMENT + static int i=0; + i++; + if(i==10){ + struct rq *rq = cpu_rq(0); + //lock_timekeeper(); + raw_spin_lock_irq(&rq->lock); + spin_lock_irqsave(&sched_debug_lock, flags); + write_lock_irqsave(&tasklist_lock, flags); + BUG_ON(1); + } +#endif + + local_irq_save(flags); + ktime = ktime_to_ns(ktime_get()); + sched_clk = sched_clock(); + cpu_clk = local_clock(); + local_irq_restore(flags); + + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + +#define P(x) \ + SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(ktime); + PN(sched_clk); + PN(cpu_clk); + P(jiffies); +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + P(sched_clock_stable); +#endif +#undef PN +#undef P + + SEQ_printf(m, "\n"); + SEQ_printf(m, "sysctl_sched\n"); + +#define P(x) \ + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_latency); + PN(sysctl_sched_min_granularity); + PN(sysctl_sched_wakeup_granularity); + P(sysctl_sched_child_runs_first); + P(sysctl_sched_features); +#undef PN +#undef P + + SEQ_printf(m, " .%-40s: %d (%s)\n", + "sysctl_sched_tunable_scaling", + sysctl_sched_tunable_scaling, + sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + SEQ_printf(m, "\n"); +} + +static int sched_debug_show(struct seq_file *m, void *v) +{ + int cpu = (unsigned long)(v - 2); + unsigned long flags; + + if (cpu != -1) { + read_lock_irqsave(&tasklist_lock, flags); + print_cpu(m, cpu); + read_unlock_irqrestore(&tasklist_lock, flags); + SEQ_printf(m, "\n"); + } else + sched_debug_header(m); + + return 0; +} + +void sysrq_sched_debug_show(void) +{ + int cpu; + unsigned long flags; + + sched_debug_header(NULL); + read_lock_irqsave(&tasklist_lock, flags); + //for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) + print_cpu(NULL, cpu); + read_unlock_irqrestore(&tasklist_lock, flags); + +} + +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *sched_debug_start(struct seq_file *file, loff_t *offset) +{ + unsigned long n = *offset; + + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return sched_debug_start(file, offset); +} + +static void sched_debug_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations sched_debug_sops = { + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, +}; + +static int sched_debug_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + + return 0; +} + +static int sched_debug_open(struct inode *inode, struct file *filp) +{ + int ret = 0; + + ret = seq_open(filp, &sched_debug_sops); + + return ret; +} + +static const struct file_operations sched_debug_fops = { + .open = sched_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = sched_debug_release, +}; + +static int __init init_sched_debug_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); + if (!pe) + return -ENOMEM; + return 0; +} + +__initcall(init_sched_debug_procfs); + +void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +{ + unsigned long nr_switches; + + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + get_nr_threads(p)); + SEQ_printf(m, + "---------------------------------------------------------\n"); +#define __P(F) \ + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) +#define P(F) \ + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) \ + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + + PN(se.exec_start); + PN(se.vruntime); + PN(se.sum_exec_runtime); + + nr_switches = p->nvcsw + p->nivcsw; + +#ifdef CONFIG_SCHEDSTATS + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.nr_migrations); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + + { + u64 avg_atom, avg_per_cpu; + + avg_atom = p->se.sum_exec_runtime; + if (nr_switches) + avg_atom = div64_ul(avg_atom, nr_switches); + else + avg_atom = -1LL; + + avg_per_cpu = p->se.sum_exec_runtime; + if (p->se.nr_migrations) { + avg_per_cpu = div64_u64(avg_per_cpu, + p->se.nr_migrations); + } else { + avg_per_cpu = -1LL; + } + + __PN(avg_atom); + __PN(avg_per_cpu); + } +#endif + __P(nr_switches); + SEQ_printf(m, "%-35s:%21Ld\n", + "nr_voluntary_switches", (long long)p->nvcsw); + SEQ_printf(m, "%-35s:%21Ld\n", + "nr_involuntary_switches", (long long)p->nivcsw); + + P(se.load.weight); +#ifdef CONFIG_SMP + P(se.avg.runnable_avg_sum); + P(se.avg.runnable_avg_period); + P(se.avg.load_avg_contrib); + P(se.avg.decay_count); + +# ifdef MTK_SCHED_CMP_PRINT +# ifdef CONFIG_MTK_SCHED_CMP + /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11 */ + P(se.avg.usage_avg_sum); + P(se.avg.load_avg_ratio); +# endif + P(se.avg.last_runnable_update); +# endif +#endif + P(policy); + P(prio); +#undef PN +#undef __PN +#undef P +#undef __P + + { + unsigned int this_cpu = raw_smp_processor_id(); + u64 t0, t1; + + t0 = cpu_clock(this_cpu); + t1 = cpu_clock(this_cpu); + SEQ_printf(m, "%-35s:%21Ld\n", + "clock-delta", (long long)(t1-t0)); + } +} + +void proc_sched_set_task(struct task_struct *p) +{ +#ifdef CONFIG_SCHEDSTATS + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); +#endif +} + +#define read_trylock_irqsave(lock, flags) \ + ({ \ + typecheck(unsigned long, flags); \ + local_irq_save(flags); \ + read_trylock(lock)? \ + 1 : ({ local_irq_restore(flags); 0; }); \ + }) + +int read_trylock_n_irqsave(rwlock_t *lock, unsigned long *flags, struct seq_file *m, char *msg){ + int locked, trylock_cnt=0; + + do{ + locked = read_trylock_irqsave(lock, *flags); + trylock_cnt++; + mdelay(10); + }while((!locked) && (trylock_cnt < TRYLOCK_NUM)); + + if (!locked){ +#ifdef CONFIG_DEBUG_SPINLOCK + struct task_struct *owner = NULL; +#endif + SEQ_printf(m, "Warning: fail to get lock in %s\n", msg); +#ifdef CONFIG_DEBUG_SPINLOCK + if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT ) + owner = lock->owner; +# ifdef CONFIG_SMP + SEQ_printf(m, " lock: %p, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d, value: %d\n", + lock, lock->magic, + owner ? owner-> comm: "<<none>>", + owner ? task_pid_nr(owner): -1, + lock->owner_cpu, lock->raw_lock.lock); +# else + SEQ_printf(m, " lock: %p, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d\n", + lock, lock->magic, + owner ? owner-> comm: "<<none>>", + owner ? task_pid_nr(owner): -1, + lock->owner_cpu); +# endif +#endif + } + + return locked; +} + +int raw_spin_trylock_n_irqsave(raw_spinlock_t *lock, unsigned long *flags, struct seq_file *m, char *msg){ + int locked, trylock_cnt=0; + + do{ + locked = raw_spin_trylock_irqsave(lock, *flags); + trylock_cnt++; + mdelay(10); + }while((!locked) && (trylock_cnt < TRYLOCK_NUM)); + + if (!locked){ +#ifdef CONFIG_DEBUG_SPINLOCK + struct task_struct *owner = NULL; +#endif + SEQ_printf(m, "Warning: fail to get lock in %s\n", msg); +#ifdef CONFIG_DEBUG_SPINLOCK + if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT ) + owner = lock->owner; +# ifdef CONFIG_ARM64 + SEQ_printf(m, " lock: %lx, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d, value: %d\n", + (long)lock, lock->magic, + owner ? owner-> comm: "<<none>>", + owner ? task_pid_nr(owner): -1, + lock->owner_cpu, + #ifdef CONFIG_SMP + lock->raw_lock.lock); + #else + lock->raw_lock.slock); + #endif +# else + SEQ_printf(m, " lock: %x, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d, value: %d\n", + (int)lock, lock->magic, + owner ? owner-> comm: "<<none>>", + owner ? task_pid_nr(owner): -1, + lock->owner_cpu, lock->raw_lock.slock); +# endif +#endif + } + + return locked; +} + +int spin_trylock_n_irqsave(spinlock_t *lock, unsigned long *flags, struct seq_file *m, char *msg){ + int locked, trylock_cnt=0; + + do{ + locked = spin_trylock_irqsave(lock, *flags); + trylock_cnt++; + mdelay(10); + + }while((!locked) && (trylock_cnt < TRYLOCK_NUM)); + + if (!locked){ +#ifdef CONFIG_DEBUG_SPINLOCK + raw_spinlock_t rlock = lock->rlock; + struct task_struct *owner = NULL; +#endif + SEQ_printf(m, "Warning: fail to get lock in %s\n", msg); +#ifdef CONFIG_DEBUG_SPINLOCK + if (rlock.owner && rlock.owner != SPINLOCK_OWNER_INIT ) + owner = rlock.owner; +# ifdef CONFIG_ARM64 + SEQ_printf(m, " lock: %lx, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d, value: %d\n", + (long) &rlock, rlock.magic, + owner ? owner-> comm: "<<none>>", + owner ? task_pid_nr(owner): -1, + rlock.owner_cpu, + #ifdef CONFIG_SMP + rlock.raw_lock.lock); + #else + rlock.raw_lock.slock); + #endif +# else + SEQ_printf(m, " lock: %x, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d, value: %d\n", + (int) &rlock, rlock.magic, + owner ? owner-> comm: "<<none>>", + owner ? task_pid_nr(owner): -1, + rlock.owner_cpu, rlock.raw_lock.slock); +# endif +#endif + } + + return locked; +} + +void print_rq_at_KE(struct seq_file *m, struct rq *rq, int rq_cpu) +{ + struct task_struct *g, *p; + unsigned long flags; + int locked; + + SEQ_printf(m, + "runnable tasks:\n" + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" + "------------------------------------------------------" + "----------------------------------------------------\n"); + + //read_lock_irqsave(&tasklist_lock, flags); + locked = read_trylock_n_irqsave(&tasklist_lock, &flags, m, "print_rq_at_KE"); + + do_each_thread(g, p) { + if (!p->on_rq || task_cpu(p) != rq_cpu) + continue; + + print_task(m, rq, p); + } while_each_thread(g, p); + + if (locked) + read_unlock_irqrestore(&tasklist_lock, flags); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void print_cfs_group_stats_at_KE(struct seq_file *m, int cpu, struct task_group *tg) +{ + struct sched_entity *se = tg->se[cpu]; + +#define P(F) \ + SEQ_printf(m, " .%-22s: %lld\n", #F, (long long)F) +#define PN(F) \ + SEQ_printf(m, " .%-22s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + + if (!se) { + struct sched_avg *avg = &cpu_rq(cpu)->avg; + P(avg->runnable_avg_sum); + P(avg->runnable_avg_period); +#ifdef MTK_SCHED_CMP_PRINT +# ifdef CONFIG_MTK_SCHED_CMP + /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11 */ + P(avg->usage_avg_sum); + P(avg->load_avg_ratio); +# endif + P(avg->last_runnable_update); +#endif + return; + } + + + PN(se->exec_start); + PN(se->vruntime); + PN(se->sum_exec_runtime); + P(se->load.weight); +#ifdef CONFIG_SMP + P(se->avg.runnable_avg_sum); + P(se->avg.runnable_avg_period); + P(se->avg.usage_avg_sum); + P(se->avg.load_avg_contrib); + P(se->avg.decay_count); + +# ifdef MTK_SCHED_CMP_PRINT +# ifdef CONFIG_MTK_SCHED_CMP + /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11 */ + P(se->avg.usage_avg_sum); + P(se->avg.load_avg_ratio); +# endif + P(se->avg.last_runnable_update); +# endif +#endif +#undef PN +#undef P +} +#endif + +void print_cfs_rq_at_KE(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +{ + s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, + spread, rq0_min_vruntime, spread0; + struct rq *rq = cpu_rq(cpu); + struct sched_entity *last; + unsigned long flags; + int locked; + +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); +#else + SEQ_printf(m, "cfs_rq[%d]:\n", cpu); +#endif + SEQ_printf(m, " .%-22s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_rq->exec_clock)); + + //raw_spin_lock_irqsave(&rq->lock, flags); + locked = raw_spin_trylock_n_irqsave(&rq->lock, &flags, m, "print_cfs_rq_at_KE"); + if (cfs_rq->rb_leftmost) + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; + min_vruntime = cfs_rq->min_vruntime; + rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; + if(locked) + raw_spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-22s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-22s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-22s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-22s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-22s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); + SEQ_printf(m, " .%-22s: %d\n", "nr_spread_over", + cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-22s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-22s: %ld\n", "load", cfs_rq->load.weight); +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-22s: %ld\n", "runnable_load_avg", + cfs_rq->runnable_load_avg); + SEQ_printf(m, " .%-22s: %ld\n", "blocked_load_avg", + cfs_rq->blocked_load_avg); +# ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, " .%-22s: %ld\n", "tg_load_contrib", + cfs_rq->tg_load_contrib); + SEQ_printf(m, " .%-22s: %d\n", "tg_runnable_contrib", + cfs_rq->tg_runnable_contrib); + SEQ_printf(m, " .%-22s: %ld\n", "tg->load_avg", + atomic_long_read(&cfs_rq->tg->load_avg)); + SEQ_printf(m, " .%-22s: %d\n", "tg->runnable_avg", + atomic_read(&cfs_rq->tg->runnable_avg)); +# endif +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + print_cfs_group_stats_at_KE(m, cpu, cfs_rq->tg); +#endif +} + +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + +void print_cfs_stats_at_KE(struct seq_file *m, int cpu) +{ + struct cfs_rq *cfs_rq; + + rcu_read_lock(); + for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + print_cfs_rq_at_KE(m, cpu, cfs_rq); + rcu_read_unlock(); +} + +void print_rt_rq_at_KE(struct seq_file *m, int cpu, struct rt_rq *rt_rq) +{ +#ifdef CONFIG_RT_GROUP_SCHED + SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +#else + SEQ_printf(m, "rt_rq[%d]:\n", cpu); +#endif + +#define P(x) \ + SEQ_printf(m, " .%-22s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-22s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + P(rt_nr_running); + P(rt_throttled); + SEQ_printf(m, " exec_task[%d:%s], prio=%d exec_delta_time[%llu]" + ", clock_task[%llu], exec_start[%llu]\n", + per_cpu(exec_task, cpu).pid, + per_cpu(exec_task, cpu).comm, + per_cpu(exec_task, cpu).prio, + per_cpu(exec_delta_time, cpu), + per_cpu(clock_task, cpu), + per_cpu(exec_start, cpu)); + PN(rt_time); + PN(rt_runtime); + +#undef PN +#undef P +} + +#ifdef CONFIG_RT_GROUP_SCHED +typedef struct task_group *rt_rq_iter_t; + +static inline struct task_group *next_task_group(struct task_group *tg) +{ + do { + tg = list_entry_rcu(tg->list.next, + typeof(struct task_group), list); + } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); + + if (&tg->list == &task_groups) + tg = NULL; + + return tg; +} + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for (iter = container_of(&task_groups, typeof(*iter), list); \ + (iter = next_task_group(iter)) && \ + (rt_rq = iter->rt_rq[cpu_of(rq)]);) + +#else /* !CONFIG_RT_GROUP_SCHED */ + +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +#endif + +void print_rt_stats_at_KE(struct seq_file *m, int cpu) +{ + rt_rq_iter_t iter; + struct rt_rq *rt_rq; + + rcu_read_lock(); + for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) + print_rt_rq_at_KE(m, cpu, rt_rq); + rcu_read_unlock(); +} + +static void print_cpu_at_KE(struct seq_file *m, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + int locked; + +#ifdef CONFIG_X86 + { + unsigned int freq = cpu_khz ? : 1; + + SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + cpu, freq / 1000, (freq % 1000)); + } +#else + SEQ_printf(m, "cpu#%d: %s\n", cpu, cpu_is_offline(cpu)?"Offline":"Online"); +#endif + +#define P(x) \ +do { \ + if (sizeof(rq->x) == 4) \ + SEQ_printf(m, " .%-22s: %ld\n", #x, (long)(rq->x)); \ + else \ + SEQ_printf(m, " .%-22s: %Ld\n", #x, (long long)(rq->x));\ +} while (0) + +#define PN(x) \ + SEQ_printf(m, " .%-22s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) + + P(nr_running); + SEQ_printf(m, " .%-22s: %lu\n", "load", + rq->load.weight); + P(nr_switches); + P(nr_load_updates); + P(nr_uninterruptible); + PN(next_balance); + P(curr->pid); + PN(clock); + P(cpu_load[0]); + P(cpu_load[1]); + P(cpu_load[2]); + P(cpu_load[3]); + P(cpu_load[4]); +#undef P +#undef PN + +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-22s: %d\n", #n, rq->n); +#define P64(n) SEQ_printf(m, " .%-22s: %Ld\n", #n, rq->n); + + P(yld_count); + + P(sched_count); + P(sched_goidle); +#ifdef CONFIG_SMP + P64(avg_idle); +#endif + + P(ttwu_count); + P(ttwu_local); + +#undef P +#undef P64 +#endif + //spin_lock_irqsave(&sched_debug_lock, flags); + locked = spin_trylock_n_irqsave( &sched_debug_lock, &flags, m, "print_cpu_at_KE"); + print_cfs_stats_at_KE(m, cpu); + print_rt_stats_at_KE(m, cpu); + + rcu_read_lock(); + print_rq_at_KE(m, rq, cpu); + SEQ_printf(m, + "======================================================" + "====================================================\n"); + rcu_read_unlock(); + if (locked) + spin_unlock_irqrestore(&sched_debug_lock, flags); +} + +static void sched_debug_header_at_KE(struct seq_file *m) +{ + u64 ktime=0, sched_clk, cpu_clk; + unsigned long flags; + + local_irq_save(flags); + // ktime = ktime_to_ns(ktime_get()); + sched_clk = sched_clock(); + cpu_clk = local_clock(); + local_irq_restore(flags); + + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + +#define P(x) \ + SEQ_printf(m, "%-22s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, "%-22s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(ktime); + PN(sched_clk); + PN(cpu_clk); + P(jiffies); +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + P(sched_clock_stable); +#endif +#undef PN +#undef P + + //SEQ_printf(m, "\n"); + SEQ_printf(m, "sysctl_sched\n"); + +#define P(x) \ + SEQ_printf(m, " .%-35s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, " .%-35s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_latency); + PN(sysctl_sched_min_granularity); + PN(sysctl_sched_wakeup_granularity); + P(sysctl_sched_child_runs_first); + P(sysctl_sched_features); +#undef PN +#undef P + + SEQ_printf(m, " .%-35s: %d (%s)\n", + "sysctl_sched_tunable_scaling", + sysctl_sched_tunable_scaling, + sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + SEQ_printf(m, "\n"); +} + +void sysrq_sched_debug_show_at_KE(void) +{ + int cpu; + unsigned long flags; + int locked; + + sched_debug_header_at_KE(NULL); + //read_lock_irqsave(&tasklist_lock, flags); + locked = read_trylock_n_irqsave(&tasklist_lock, &flags, NULL, "sched_debug_show_at_KE"); + //for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) + print_cpu_at_KE(NULL, cpu); + if (locked) + read_unlock_irqrestore(&tasklist_lock, flags); + +} + +#ifdef CONFIG_MET_SCHED_HMP +/* MET */ +#include <linux/export.h> +#include <linux/met_drv.h> + +static char header[] = +"met-info [000] 0.0: ms_ud_sys_header: TaskTh,B->th,L->th,d,d\n" +"met-info [000] 0.0: ms_ud_sys_header: HmpStat,force_up,force_down,d,d\n" +"met-info [000] 0.0: ms_ud_sys_header: HmpLoad,big_load_avg,little_load_avg,d,d\n" +"met-info [000] 0.0: ms_ud_sys_header: RqLen,rq0,rq1,rq2,rq3,d,d,d,d\n" +"met-info [000] 0.0: ms_ud_sys_header: CfsLen,cfs_rq0,cfs_rq1,cfs_rq2,cfs_rq3,d,d,d,d\n" +"met-info [000] 0.0: ms_ud_sys_header: RtLen,rt_rq0,rt_rq1,rt_rq2,rt_rq3,d,d,d,d\n"; + +static char help[] = " --met_hmp_cfs monitor hmp_cfs\n"; +static int sample_print_help(char *buf, int len) +{ + return snprintf(buf, PAGE_SIZE, help); +} + +static int sample_print_header(char *buf, int len) +{ + return snprintf(buf, PAGE_SIZE, header); +} + +unsigned int mt_cfs_dbg=0; +static void sample_start(void) +{ + mt_cfs_dbg=1; + return; +} + +static void sample_stop(void) +{ + mt_cfs_dbg=0; + return; +} + +struct metdevice met_hmp_cfs = { + .name = "hmp_cfs", + .owner = THIS_MODULE, + .type = MET_TYPE_BUS, + .start = sample_start, + .stop = sample_stop, + .print_help = sample_print_help, + .print_header = sample_print_header, +}; +EXPORT_SYMBOL(met_hmp_cfs); + +void TaskTh(unsigned int B_th,unsigned int L_th){ +if(mt_cfs_dbg) +trace_printk("%d,%d\n",B_th,L_th); +} + +void HmpStat(struct hmp_statisic *hmp_stats){ +if(mt_cfs_dbg) +trace_printk("%d,%d\n",hmp_stats->nr_force_up,hmp_stats->nr_force_down); +} + +void HmpLoad(int big_load_avg, int little_load_avg){ +if(mt_cfs_dbg) +trace_printk("%d,%d\n",big_load_avg,little_load_avg); +} + +static DEFINE_PER_CPU(unsigned int, cfsrqCnt); +static DEFINE_PER_CPU(unsigned int, rtrqCnt); +static DEFINE_PER_CPU(unsigned int, rqCnt); + +void RqLen(int cpu, int length){ +if(mt_cfs_dbg){ + per_cpu(rqCnt, cpu) = length; + #if NR_CPUS == 4 + trace_printk("%d,%d,%d,%d\n",per_cpu(rqCnt,0),per_cpu(rqCnt,1),per_cpu(rqCnt,2),per_cpu(rqCnt,3)); + #endif + } +} + +void CfsLen(int cpu, int length){ +if(mt_cfs_dbg){ + per_cpu(cfsrqCnt, cpu) = length; + #if NR_CPUS == 4 + trace_printk("%d,%d,%d,%d\n",per_cpu(cfsrqCnt,0),per_cpu(cfsrqCnt,1),per_cpu(cfsrqCnt,2),per_cpu(cfsrqCnt,3)); + #endif + } +} + +void RtLen(int cpu, int length){ +if(mt_cfs_dbg){ + per_cpu(rtrqCnt, cpu) = length; + #if NR_CPUS == 4 + trace_printk("%d,%d,%d,%d\n",per_cpu(rtrqCnt,0),per_cpu(rtrqCnt,1),per_cpu(rtrqCnt,2),per_cpu(rtrqCnt,3)); + #endif + } +} +#endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c new file mode 100644 index 000000000..e3120c0e2 --- /dev/null +++ b/kernel/sched/fair.c @@ -0,0 +1,10321 @@ +/* + * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) + * + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Interactivity improvements by Mike Galbraith + * (C) 2007 Mike Galbraith <efault@gmx.de> + * + * Various enhancements by Dmitry Adamushko. + * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> + * + * Group scheduling enhancements by Srivatsa Vaddagiri + * Copyright IBM Corporation, 2007 + * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> + * + * Scaled math optimizations by Thomas Gleixner + * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + */ + +#include <linux/latencytop.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/slab.h> +#include <linux/profile.h> +#include <linux/interrupt.h> +#include <linux/mempolicy.h> +#include <linux/migrate.h> +#include <linux/task_work.h> + +#include <trace/events/sched.h> +#ifdef CONFIG_HMP_VARIABLE_SCALE +#include <linux/sysfs.h> +#include <linux/vmalloc.h> +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Include cpufreq header to add a notifier so that cpu frequency + * scaling can track the current CPU frequency + */ +#include <linux/cpufreq.h> +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#endif /* CONFIG_HMP_VARIABLE_SCALE */ + +#include "sched.h" + +#include <mtlbprof/mtlbprof.h> + + +#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT +#ifdef CONFIG_LOCAL_TIMERS +unsigned long localtimer_get_counter(void); +#endif +#endif + +#ifdef CONFIG_HEVTASK_INTERFACE +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#ifdef CONFIG_KGDB_KDB +#include <linux/kdb.h> +#endif +#endif + +/* + * Targeted preemption latency for CPU-bound tasks: + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + * + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length + * and have no persistent notion like in traditional, time-slice + * based scheduling concepts. + * + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches (cs) field) + */ +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; + +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling + = SCHED_TUNABLESCALING_LOG; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; + +/* + * is kept at sysctl_sched_latency / sysctl_sched_min_granularity + */ +static unsigned int sched_nr_latency = 8; + +/* + * After fork, child runs first. If set to 0 (default) then + * parent will (try to) run first. + */ +unsigned int sysctl_sched_child_runs_first __read_mostly; + +/* + * SCHED_OTHER wake-up granularity. + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + * + * This option delays the preemption effects of decoupled workloads + * and reduces their over-scheduling. Synchronous workloads will still + * have immediate wakeup/sleep latencies. + */ +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + +const_debug unsigned int sysctl_sched_migration_cost = 33000UL; + +/* + * The exponential sliding window over which load is averaged for shares + * distribution. + * (default: 10msec) + */ +unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; + +#ifdef CONFIG_CFS_BANDWIDTH +/* + * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool + * each time a cfs_rq requests quota. + * + * Note: in the case that the slice exceeds the runtime remaining (either due + * to consumption or the quota being specified to be smaller than the slice) + * we will always only issue the remaining available time. + * + * default: 5 msec, units: microseconds + */ +unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif +#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE) +static int need_lazy_balance(int dst_cpu, int src_cpu, struct task_struct *p); +#endif + +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static int get_update_sysctl_factor(void) +{ + unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } + + return factor; +} + +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); + +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); +#undef SET_SYSCTL +} + +void sched_init_granularity(void) +{ + update_sysctl(); +} +#if defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK) || defined (CONFIG_HMP_PACK_SMALL_TASK) +/* + * Save the id of the optimal CPU that should be used to pack small tasks + * The value -1 is used when no buddy has been found + */ +DEFINE_PER_CPU(int, sd_pack_buddy) = {-1}; + +#ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK +struct cpumask buddy_cpu_map = {{0}}; +#endif + +/* Look for the best buddy CPU that can be used to pack small tasks + * We make the assumption that it doesn't wort to pack on CPU that share the + * same powerline. We looks for the 1st sched_domain without the + * SD_SHARE_POWERLINE flag. Then We look for the sched_group witht the lowest + * power per core based on the assumption that their power efficiency is + * better */ +void update_packing_domain(int cpu) +{ + struct sched_domain *sd; + int id = -1; + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + pr_info("[PACK] update_packing_domain() CPU%d\n", cpu); +#endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */ + mt_sched_printf(sched_pa, "[PACK] update_packing_domain() CPU%d", cpu); + + sd = highest_flag_domain(cpu, SD_SHARE_POWERLINE); + if (!sd) + { + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + } + else + if (cpumask_first(sched_domain_span(sd)) == cpu || !sd->parent) + sd = sd->parent; + + while (sd) { + struct sched_group *sg = sd->groups; + struct sched_group *pack = sg; + struct sched_group *tmp = sg->next; + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + pr_info("[PACK] sd = 0x%08x, flags = %d\n", (unsigned int)sd, sd->flags); +#endif /* CONFIG_HMP_PACK_BUDDY_INFO */ + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + pr_info("[PACK] sg = 0x%08x\n", (unsigned int)sg); +#endif /* CONFIG_HMP_PACK_BUDDY_INFO */ + + /* 1st CPU of the sched domain is a good candidate */ + if (id == -1) + id = cpumask_first(sched_domain_span(sd)); + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + pr_info("[PACK] First cpu in this sd id = %d\n", id); +#endif /* CONFIG_HMP_PACK_BUDDY_INFO */ + + /* Find sched group of candidate */ + tmp = sd->groups; + do { + if (cpumask_test_cpu(id, sched_group_cpus(tmp))) { + sg = tmp; + break; + } + } while (tmp = tmp->next, tmp != sd->groups); + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + pr_info("[PACK] pack = 0x%08x\n", (unsigned int)sg); +#endif /* CONFIG_HMP_PACK_BUDDY_INFO */ + + pack = sg; + tmp = sg->next; + + /* loop the sched groups to find the best one */ + //Stop find the best one in the same Load Balance Domain + //while (tmp != sg) { + while (tmp != sg && !(sd->flags & SD_LOAD_BALANCE)) { + if (tmp->sgp->power * sg->group_weight < + sg->sgp->power * tmp->group_weight) { + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + pr_info("[PACK] Now sg power = %u, weight = %u, mask = %lu\n", sg->sgp->power, sg->group_weight, sg->cpumask[0]); + pr_info("[PACK] Better sg power = %u, weight = %u, mask = %lu\n", tmp->sgp->power, tmp->group_weight, tmp->cpumask[0]); +#endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */ + + pack = tmp; + } + tmp = tmp->next; + } + + /* we have found a better group */ + if (pack != sg) { + id = cpumask_first(sched_group_cpus(pack)); + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + pr_info("[PACK] Better sg, first cpu id = %d\n", id); +#endif /* CONFIG_HMP_PACK_BUDDY_INFO */ + + } + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + if(sd->parent) { + pr_info("[PACK] cpu = %d, id = %d, sd->parent = 0x%08x, flags = %d, SD_LOAD_BALANCE = %d\n", cpu, id, (unsigned int)sd->parent, sd->parent->flags, SD_LOAD_BALANCE); + pr_info("[PACK] %d\n", (id != cpu)); + pr_info("[PACK] 0x%08x\n", (unsigned int)(sd->parent)); + pr_info("[PACK] %d\n", (sd->parent->flags & SD_LOAD_BALANCE)); + } + else { + pr_info("[PACK] cpu = %d, id = %d, sd->parent = 0x%08x\n", cpu, id, (unsigned int)sd->parent); + } +#endif /* CONFIG_HMP_PACK_BUDDY_INFO */ + + + /* Look for another CPU than itself */ + if ((id != cpu) || + ((sd->parent) && (sd->parent->flags & SD_LOAD_BALANCE))) { + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + pr_info("[PACK] Break\n"); +#endif /*CONFIG_HMP_PACK_BUDDY_INFO */ + + break; + } + sd = sd->parent; + } + +#ifdef CONFIG_HMP_PACK_BUDDY_INFO + pr_info("[PACK] CPU%d packing on CPU%d\n", cpu, id); +#endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */ + mt_sched_printf(sched_pa, "[PACK] CPU%d packing on CPU%d", cpu, id); + +#ifdef CONFIG_HMP_PACK_SMALL_TASK + per_cpu(sd_pack_buddy, cpu) = id; +#else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */ + if(per_cpu(sd_pack_buddy, cpu) != -1) + cpu_clear(per_cpu(sd_pack_buddy, cpu), buddy_cpu_map); + per_cpu(sd_pack_buddy, cpu) = id; + if(id != -1) + cpumask_set_cpu(id, &buddy_cpu_map); +#endif +} + +#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER +DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_USAGE); +DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_PERIOD); +DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_NR); +DEFINE_PER_CPU(u32, TASK_USGAE); +DEFINE_PER_CPU(u32, TASK_PERIOD); +u32 PACK_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS]; +u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS]; +u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS]; +u32 TASK_PACK_CPU_COUNT[4][NR_CPUS] = {{0}}; +u32 PA_ENABLE = 1; +u32 PA_MON_ENABLE = 0; +char PA_MON[4][TASK_COMM_LEN]={{0}}; +#endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER */ + +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER +DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_USAGE); +DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_PERIOD); +DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_NR); +DEFINE_PER_CPU(u32, TASK_USGAE); +DEFINE_PER_CPU(u32, TASK_PERIOD); +u32 PACK_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS]; +u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS]; +u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS]; +u32 HMP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS]; +u32 PA_ENABLE = 1; +u32 LB_ENABLE = 1; +u32 PA_MON_ENABLE = 0; +char PA_MON[TASK_COMM_LEN]; + +#ifdef CONFIG_HMP_TRACER +#define POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY (0) +#define POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY (1) +#define POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY (2) +#define POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY (3) +#endif /* CONFIG_HMP_TRACER */ + +#endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER */ + + +static inline bool is_buddy_busy(int cpu) +{ +#ifdef CONFIG_HMP_PACK_SMALL_TASK + struct rq *rq; + + if (cpu < 0) + return 0; + + rq = cpu_rq(cpu); +#else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */ + struct rq *rq = cpu_rq(cpu); +#endif + /* + * A busy buddy is a CPU with a high load or a small load with a lot of + * running tasks. + */ + +#if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER) + per_cpu(BUDDY_CPU_RQ_USAGE, cpu) = rq->avg.usage_avg_sum; + per_cpu(BUDDY_CPU_RQ_PERIOD, cpu) = rq->avg.runnable_avg_period; + per_cpu(BUDDY_CPU_RQ_NR, cpu) = rq->nr_running; +#endif /*(CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER) */ + + return ((rq->avg.usage_avg_sum << rq->nr_running) > + rq->avg.runnable_avg_period); + +} + +static inline bool is_light_task(struct task_struct *p) +{ +#if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER) + per_cpu(TASK_USGAE, task_cpu(p)) = p->se.avg.usage_avg_sum; + per_cpu(TASK_PERIOD, task_cpu(p)) = p->se.avg.runnable_avg_period; +#endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER || CONFIG_HMP_POWER_AWARE_CONTROLLER*/ + + /* A light task runs less than 25% in average */ + return ((p->se.avg.usage_avg_sum << 2) < p->se.avg.runnable_avg_period); +} + + +static int check_pack_buddy(int cpu, struct task_struct *p) +{ +#ifdef CONFIG_HMP_PACK_SMALL_TASK + int buddy; + + if(cpu >= NR_CPUS || cpu < 0) + return false; + buddy = per_cpu(sd_pack_buddy, cpu); +#else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */ + int buddy = cpu; +#endif + + /* No pack buddy for this CPU */ + if (buddy == -1) + return false; + + /* + * If a task is waiting for running on the CPU which is its own buddy, + * let the default behavior to look for a better CPU if available + * The threshold has been set to 37.5% + */ +#ifdef CONFIG_HMP_PACK_SMALL_TASK + if ((buddy == cpu) + && ((p->se.avg.usage_avg_sum << 3) < (p->se.avg.runnable_avg_sum * 5))) + return false; +#endif + + /* buddy is not an allowed CPU */ + if (!cpumask_test_cpu(buddy, tsk_cpus_allowed(p))) + return false; + + /* + * If the task is a small one and the buddy is not overloaded, + * we use buddy cpu + */ + if (!is_light_task(p) || is_buddy_busy(buddy)) + return false; + + return true; +} +#endif /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK || CONFIG_HMP_PACK_SMALL_TASK*/ + +#if BITS_PER_LONG == 32 +# define WMULT_CONST (~0UL) +#else +# define WMULT_CONST (1UL << 32) +#endif + +#define WMULT_SHIFT 32 + +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +/* + * delta *= weight / lw + */ +static unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, + struct load_weight *lw) +{ + u64 tmp; + + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. + */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; + + if (!lw->inv_weight) { + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; + } + + /* + * Check whether we'd overflow the 64-bit multiplication: + */ + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); +} + + +const struct sched_class fair_sched_class; + +/************************************************************** + * CFS operations on generic schedulable entities: + */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* cpu runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) + +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + +/* Walk up scheduling entities hierarchy */ +#define for_each_sched_entity(se) \ + for (; se; se = se->parent) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update); + +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->on_list) { + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases. + */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } else { + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } + + cfs_rq->on_list = 1; + /* We should have no load, but we need to update last_decay. */ + update_cfs_rq_blocked_load(cfs_rq, 0); + } +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->on_list) { + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + cfs_rq->on_list = 0; + } +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + if (se && pse) + { + if (se->cfs_rq == pse->cfs_rq) + return 1; + } + + return 0; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ + int depth = 0; + + for_each_sched_entity(se) + depth++; + + return depth; +} + +static void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ + int se_depth, pse_depth; + + /* + * preemption test can be made between sibling entities who are in the + * same cfs_rq i.e who have a common parent. Walk up the hierarchy of + * both tasks until we find their ancestors who are siblings of common + * parent. + */ + + /* First walk up until both entities are at same depth */ + se_depth = depth_se(*se); + pse_depth = depth_se(*pse); + + while (se_depth > pse_depth) { + se_depth--; + *se = parent_entity(*se); + } + + while (pse_depth > se_depth) { + pse_depth--; + *pse = parent_entity(*pse); + } + + while (!is_same_group(*se, *pse)) { + *se = parent_entity(*se); + *pse = parent_entity(*pse); + } +} + +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} + +#define entity_is_task(se) 1 + +#define for_each_sched_entity(se) \ + for (; se; se = NULL) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} + +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) + +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + return 1; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; +} + +static inline void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); + +/************************************************************** + * Scheduling class tree data structure manipulation methods: + */ + +static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - max_vruntime); + if (delta > 0) + max_vruntime = vruntime; + + return max_vruntime; +} + +static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta < 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline int entity_before(struct sched_entity *a, + struct sched_entity *b) +{ + return (s64)(a->vruntime - b->vruntime) < 0; +} + +static void update_min_vruntime(struct cfs_rq *cfs_rq) +{ + u64 vruntime = cfs_rq->min_vruntime; + + if (cfs_rq->curr) + vruntime = cfs_rq->curr->vruntime; + + if (cfs_rq->rb_leftmost) { + struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, + struct sched_entity, + run_node); + + if (!cfs_rq->curr) + vruntime = se->vruntime; + else + vruntime = min_vruntime(vruntime, se->vruntime); + } + + /* ensure we never gain time by being placed backwards. */ + cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif +} + +/* + * Enqueue an entity into the rb-tree: + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node *parent = NULL; + struct sched_entity *entry; + int leftmost = 1; + + /* + * Find the right place in the rbtree: + */ + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_entity, run_node); + /* + * We dont care about collisions. Nodes with + * the same key stay together. + */ + if (entity_before(se, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + /* + * Maintain a cache of leftmost tree entries (it is frequently + * used): + */ + if (leftmost) + cfs_rq->rb_leftmost = &se->run_node; + + rb_link_node(&se->run_node, parent, link); + rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); +} + +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->rb_leftmost == &se->run_node) { + struct rb_node *next_node; + + next_node = rb_next(&se->run_node); + cfs_rq->rb_leftmost = next_node; + } + + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); +} + +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *left = cfs_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_entity, run_node); +} + +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ + struct rb_node *next = rb_next(&se->run_node); + + if (!next) + return NULL; + + return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG +struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + + if (!last) + return NULL; + + return rb_entry(last, struct sched_entity, run_node); +} + +/************************************************************** + * Scheduling class statistics methods: + */ + +int sched_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int factor = get_update_sysctl_factor(); + + if (ret || !write) + return ret; + + sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, + sysctl_sched_min_granularity); + +#define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) + WRT_SYSCTL(sched_min_granularity); + WRT_SYSCTL(sched_latency); + WRT_SYSCTL(sched_wakeup_granularity); +#undef WRT_SYSCTL + + return 0; +} +#endif + +/* + * delta /= w + */ +static inline unsigned long +calc_delta_fair(unsigned long delta, struct sched_entity *se) +{ + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); + + return delta; +} + +/* + * The idea is to set a period in which each task runs once. + * + * When there are too many tasks (sched_nr_latency) we have to stretch + * this period because otherwise the slices get too small. + * + * p = (nr <= nl) ? l : l*nr/nl + */ +static u64 __sched_period(unsigned long nr_running) +{ + u64 period = sysctl_sched_latency; + unsigned long nr_latency = sched_nr_latency; + + if (unlikely(nr_running > nr_latency)) { + period = sysctl_sched_min_granularity; + period *= nr_running; + } + + return period; +} + +/* + * We calculate the wall-time slice from the period by taking a part + * proportional to the weight. + * + * s = p*P[w/rw] + */ +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); + + for_each_sched_entity(se) { + struct load_weight *load; + struct load_weight lw; + + cfs_rq = cfs_rq_of(se); + load = &cfs_rq->load; + + if (unlikely(!se->on_rq)) { + lw = cfs_rq->load; + + update_load_add(&lw, se->load.weight); + load = &lw; + } + slice = calc_delta_mine(slice, se->load.weight, load); + } + return slice; +} + +/* + * We calculate the vruntime slice of a to-be-inserted task. + * + * vs = s/w + */ +static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return calc_delta_fair(sched_slice(cfs_rq, se), se); +} + + +#ifdef CONFIG_SMP +static inline void __update_task_entity_contrib(struct sched_entity *se); + +static long __update_task_entity_ratio(struct sched_entity *se); + +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ +#define LOAD_AVG_VARIABLE_PERIOD 512 +static unsigned int init_task_load_period = 4000; + +/* Give new task start runnable values to heavy its load in infant time */ +void init_task_runnable_average(struct task_struct *p) +{ + u32 slice; + + p->se.avg.decay_count = 0; + slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; + p->se.avg.runnable_avg_sum = (init_task_load_period) ? 0 : slice; + p->se.avg.runnable_avg_period = (init_task_load_period)?(init_task_load_period):slice; + __update_task_entity_contrib(&p->se); + +#ifdef CONFIG_MTK_SCHED_CMP + /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */ + p->se.avg.usage_avg_sum = (init_task_load_period) ? 0 : slice; +#endif + __update_task_entity_ratio(&p->se); + trace_sched_task_entity_avg(0, p, &p->se.avg); +} +#else +void init_task_runnable_average(struct task_struct *p) +{ +} +#endif + +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. + */ +static inline void +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, + unsigned long delta_exec) +{ + unsigned long delta_exec_weighted; + + schedstat_set(curr->statistics.exec_max, + max((u64)delta_exec, curr->statistics.exec_max)); + + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); + delta_exec_weighted = calc_delta_fair(delta_exec, curr); + + curr->vruntime += delta_exec_weighted; + update_min_vruntime(cfs_rq); +} + +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + u64 now = rq_of(cfs_rq)->clock_task; + unsigned long delta_exec; + + if (unlikely(!curr)) + return; + + /* + * Get the amount of time the current task was running + * since the last time we changed load (this cannot + * overflow on 32 bits): + */ + delta_exec = (unsigned long)(now - curr->exec_start); + if (!delta_exec) + return; + + __update_curr(cfs_rq, curr, delta_exec); + curr->exec_start = now; + + if (entity_is_task(curr)) { + struct task_struct *curtask = task_of(curr); + + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + cpuacct_charge(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); + } + + account_cfs_rq_runtime(cfs_rq, delta_exec); +} + +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); +} + +/* + * Task is being enqueued - update stats: + */ +static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * Are we enqueueing a waiting task? (for current tasks + * a dequeue/enqueue event is a NOP) + */ + if (se != cfs_rq->curr) + update_stats_wait_start(cfs_rq, se); +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, + rq_of(cfs_rq)->clock - se->statistics.wait_start)); + schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); + schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + + rq_of(cfs_rq)->clock - se->statistics.wait_start); +#ifdef CONFIG_SCHEDSTATS + if (entity_is_task(se)) { + trace_sched_stat_wait(task_of(se), + rq_of(cfs_rq)->clock - se->statistics.wait_start); + } +#endif + schedstat_set(se->statistics.wait_start, 0); +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * Mark the end of the wait period if dequeueing a + * waiting task: + */ + if (se != cfs_rq->curr) + update_stats_wait_end(cfs_rq, se); +} + +/* + * We are picking a new current task - update its stats: + */ +static inline void +update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * We are starting a new run period: + */ + se->exec_start = rq_of(cfs_rq)->clock_task; +} + +/************************************************** + * Scheduling class queueing methods: + */ + +#ifdef CONFIG_NUMA_BALANCING +/* + * numa task sample period in ms + */ +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; + +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + +static void task_numa_placement(struct task_struct *p) +{ + int seq; + + if (!p->mm) /* for example, ksmd faulting in a user's mm */ + return; + seq = ACCESS_ONCE(p->mm->numa_scan_seq); + if (p->numa_scan_seq == seq) + return; + p->numa_scan_seq = seq; + + /* FIXME: Scheduling placement policy hints go here */ +} + +/* + * Got a PROT_NONE fault for a page on @node. + */ +void task_numa_fault(int node, int pages, bool migrated) +{ + struct task_struct *p = current; + + if (!sched_feat_numa(NUMA)) + return; + + /* FIXME: Allocate task-specific structure for placement policy here */ + + /* + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes + */ + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); + + task_numa_placement(p); +} + +static void reset_ptenuma_scan(struct task_struct *p) +{ + ACCESS_ONCE(p->mm->numa_scan_seq)++; + p->mm->numa_scan_offset = 0; +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +void task_numa_work(struct callback_head *work) +{ + unsigned long migrate, next_scan, now = jiffies; + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + unsigned long start, end; + long pages; + + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; /* protect against double add */ + /* + * Who cares about NUMA placement when they're dying. + * + * NOTE: make sure not to dereference p->mm before this check, + * exit_task_work() happens _after_ exit_mm() so we could be called + * without p->mm even though we still had it when we enqueued this + * work. + */ + if (p->flags & PF_EXITING) + return; + + /* + * We do not care about task placement until a task runs on a node + * other than the first one used by the address space. This is + * largely because migrations are driven by what CPU the task + * is running on. If it's never scheduled on another node, it'll + * not migrate so why bother trapping the fault. + */ + if (mm->first_nid == NUMA_PTE_SCAN_INIT) + mm->first_nid = numa_node_id(); + if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { + /* Are we running on a new node yet? */ + if (numa_node_id() == mm->first_nid && + !sched_feat_numa(NUMA_FORCE)) + return; + + mm->first_nid = NUMA_PTE_SCAN_ACTIVE; + } + + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); + } + + /* + * Enforce maximal scan/migration frequency.. + */ + migrate = mm->numa_next_scan; + if (time_before(now, migrate)) + return; + + if (p->numa_scan_period == 0) + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + + next_scan = now + msecs_to_jiffies(p->numa_scan_period); + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + return; + + /* + * Do not set pte_numa if the current running node is rate-limited. + * This loses statistics on the fault but if we are unwilling to + * migrate to this node, it is less likely we can do useful work + */ + if (migrate_ratelimited(numa_node_id())) + return; + + start = mm->numa_scan_offset; + pages = sysctl_numa_balancing_scan_size; + pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + if (!pages) + return; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + if (!vma) { + reset_ptenuma_scan(p); + start = 0; + vma = mm->mmap; + } + for (; vma; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + + /* Skip small VMAs. They are not likely to be of relevance */ + if (vma->vm_end - vma->vm_start < HPAGE_SIZE) + continue; + + /* + * Skip inaccessible VMAs to avoid any confusion between + * PROT_NONE and NUMA hinting ptes + */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + continue; + + do { + start = max(start, vma->vm_start); + end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); + end = min(end, vma->vm_end); + pages -= change_prot_numa(vma, start, end); + + start = end; + if (pages <= 0) + goto out; + } while (end != vma->vm_end); + } + +out: + /* + * It is possible to reach the end of the VMA list but the last few VMAs are + * not guaranteed to the vma_migratable. If they are not, we would find the + * !migratable VMA on the next scan but not reset the scanner to the start + * so check it now. + */ + if (vma) + mm->numa_scan_offset = start; + else + reset_ptenuma_scan(p); + up_read(&mm->mmap_sem); +} + +/* + * Drive the periodic memory faults.. + */ +void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->numa_work; + u64 period, now; + + /* + * We don't care about NUMA placement if we don't have memory. + */ + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + return; + + /* + * Using runtime rather than walltime has the dual advantage that + * we (mostly) drive the selection from busy threads and that the + * task needs to have done some actual work before we bother with + * NUMA placement. + */ + now = curr->se.sum_exec_runtime; + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + + if (now - curr->node_stamp > period) { + if (!curr->node_stamp) + curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; + curr->node_stamp = now; + + if (!time_before(jiffies, curr->mm->numa_next_scan)) { + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + task_work_add(curr, work, true); + } + } +} +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + update_load_add(&rq_of(cfs_rq)->load, se->load.weight); +#ifdef CONFIG_SMP + if (entity_is_task(se)) + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); +#endif + cfs_rq->nr_running++; +} + +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); + if (entity_is_task(se)) + list_del_init(&se->group_node); + cfs_rq->nr_running--; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +# ifdef CONFIG_SMP +static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +{ + long tg_weight; + + /* + * Use this CPU's actual weight instead of the last load_contribution + * to gain a more accurate current total weight. See + * update_cfs_rq_load_contribution(). + */ + tg_weight = atomic_long_read(&tg->load_avg); + tg_weight -= cfs_rq->tg_load_contrib; + tg_weight += cfs_rq->load.weight; + + return tg_weight; +} + +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + long tg_weight, load, shares; + + tg_weight = calc_tg_weight(tg, cfs_rq); + load = cfs_rq->load.weight; + + shares = (tg->shares * load); + if (tg_weight) + shares /= tg_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + return shares; +} +# else /* CONFIG_SMP */ +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + return tg->shares; +} +# endif /* CONFIG_SMP */ +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + account_entity_dequeue(cfs_rq, se); + } + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + +static void update_cfs_shares(struct cfs_rq *cfs_rq) +{ + struct task_group *tg; + struct sched_entity *se; + long shares; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se || throttled_hierarchy(cfs_rq)) + return; +#ifndef CONFIG_SMP + if (likely(se->load.weight == tg->shares)) + return; +#endif + shares = calc_cfs_shares(cfs_rq, tg); + + reweight_entity(cfs_rq_of(se), se, shares); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_SMP +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +//#define LOAD_AVG_PERIOD 32 +//#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +//#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +/* + * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent + * over-estimates when re-combining. + */ +static const u32 runnable_avg_yN_sum[] = { + 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, + 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, + 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + +/* + * Approximate: + * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n) +{ + unsigned int local_n; + + if (!n) + return val; + else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + return 0; + + /* after bounds checking we can collapse to 32-bit */ + local_n = n; + + /* + * As y^PERIOD = 1/2, we can combine + * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) + * With a look-up table which covers k^n (n<PERIOD) + * + * To achieve constant time decay_load. + */ + if (unlikely(local_n >= LOAD_AVG_PERIOD)) { + val >>= local_n / LOAD_AVG_PERIOD; + local_n %= LOAD_AVG_PERIOD; + } + + val *= runnable_avg_yN_inv[local_n]; + /* We don't use SRR here since we always want to round down. */ + return val >> 32; +} + +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} + */ +static u32 __compute_runnable_contrib(u64 n) +{ + u32 contrib = 0; + + if (likely(n <= LOAD_AVG_PERIOD)) + return runnable_avg_yN_sum[n]; + else if (unlikely(n >= LOAD_AVG_MAX_N)) + return LOAD_AVG_MAX; + + /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ + do { + contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ + contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + + n -= LOAD_AVG_PERIOD; + } while (n > LOAD_AVG_PERIOD); + + contrib = decay_load(contrib, n); + return contrib + runnable_avg_yN_sum[n]; +} + +#ifdef CONFIG_HMP_VARIABLE_SCALE + +#define HMP_VARIABLE_SCALE_SHIFT 16ULL +struct hmp_global_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *a, struct attribute *b, + const char *c, size_t count); + int *value; + int (*to_sysfs)(int); + int (*from_sysfs)(int); +}; + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +#define HMP_DATA_SYSFS_MAX 5 +#else +#define HMP_DATA_SYSFS_MAX 4 +#endif + +struct hmp_data_struct { +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + int freqinvar_load_scale_enabled; +#endif + int multiplier; /* used to scale the time delta */ + struct attribute_group attr_group; + struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1]; + struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX]; +} hmp_data; + +static u64 hmp_variable_scale_convert(u64 delta); +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* Frequency-Invariant Load Modification: + * Loads are calculated as in PJT's patch however we also scale the current + * contribution in line with the frequency of the CPU that the task was + * executed on. + * In this version, we use a simple linear scale derived from the maximum + * frequency reported by CPUFreq. As an example: + * + * Consider that we ran a task for 100% of the previous interval. + * + * Our CPU was under asynchronous frequency control through one of the + * CPUFreq governors. + * + * The CPUFreq governor reports that it is able to scale the CPU between + * 500MHz and 1GHz. + * + * During the period, the CPU was running at 1GHz. + * + * In this case, our load contribution for that period is calculated as + * 1 * (number_of_active_microseconds) + * + * This results in our task being able to accumulate maximum load as normal. + * + * + * Consider now that our CPU was executing at 500MHz. + * + * We now scale the load contribution such that it is calculated as + * 0.5 * (number_of_active_microseconds) + * + * Our task can only record 50% maximum load during this period. + * + * This represents the task consuming 50% of the CPU's *possible* compute + * capacity. However the task did consume 100% of the CPU's *available* + * compute capacity which is the value seen by the CPUFreq governor and + * user-side CPU Utilization tools. + * + * Restricting tracked load to be scaled by the CPU's frequency accurately + * represents the consumption of possible compute capacity and allows the + * HMP migration's simple threshold migration strategy to interact more + * predictably with CPUFreq's asynchronous compute capacity changes. + */ +#define SCHED_FREQSCALE_SHIFT 10 +struct cpufreq_extents { + u32 curr_scale; + u32 min; + u32 max; + u32 flags; +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + u32 const_max; + u32 throttling; +#endif +}; +/* Flag set when the governor in use only allows one frequency. + * Disables scaling. + */ +#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01 + +static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS]; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ +#endif /* CONFIG_HMP_VARIABLE_SCALE */ + +#ifdef CONFIG_MTK_SCHED_CMP +int get_cluster_id(unsigned int cpu) +{ + return arch_get_cluster_id(cpu); +} + +void get_cluster_cpus(struct cpumask *cpus, int cluster_id, + bool exclusive_offline) +{ + struct cpumask cls_cpus; + + arch_get_cluster_cpus(&cls_cpus, cluster_id); + if (exclusive_offline) { + cpumask_and(cpus, cpu_online_mask, &cls_cpus); + } else + cpumask_copy(cpus, &cls_cpus); +} + +static int nr_cpus_in_cluster(int cluster_id, bool exclusive_offline) +{ + struct cpumask cls_cpus; + int nr_cpus; + + arch_get_cluster_cpus(&cls_cpus, cluster_id); + if (exclusive_offline) { + struct cpumask online_cpus; + cpumask_and(&online_cpus, cpu_online_mask, &cls_cpus); + nr_cpus = cpumask_weight(&online_cpus); + } else + nr_cpus = cpumask_weight(&cls_cpus); + + return nr_cpus; +} +#endif /* CONFIG_MTK_SCHED_CMP */ + +void sched_get_big_little_cpus(struct cpumask *big, struct cpumask *little) +{ + arch_get_big_little_cpus(big, little); +} +EXPORT_SYMBOL(sched_get_big_little_cpus); + +/* + * generic entry point for cpu mask construction, dedicated for + * mediatek scheduler. + */ +static __init __inline void cmp_cputopo_domain_setup(void) +{ + WARN(smp_processor_id() != 0, "%s is supposed runs on CPU0 " + "while kernel init", __func__); +#ifdef CONFIG_MTK_CPU_TOPOLOGY + /* + * sched_init + * |-> cmp_cputopo_domain_seutp() + * ... + * rest_init + * ^ fork kernel_init + * |-> kernel_init_freeable + * ... + * |-> arch_build_cpu_topology_domain + * + * here, we focus to build up cpu topology and domain before scheduler runs. + */ + pr_debug("[CPUTOPO][%s] build CPU topology and cluster.\n", __func__); + arch_build_cpu_topology_domain(); +#endif +} + +#ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY +static u64 __inline variable_scale_convert(u64 delta) +{ + u64 high = delta >> 32ULL; + u64 low = delta & 0xffffffffULL; + low *= LOAD_AVG_VARIABLE_PERIOD; + high *= LOAD_AVG_VARIABLE_PERIOD; + return (low >> 16ULL) + (high << (32ULL - 16ULL)); +} +#endif + +/* We can represent the historical contribution to runnable average as the + * coefficients of a geometric series. To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + * p0 p1 p2 + * (now) (~1ms ago) (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + * y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, + struct sched_avg *sa, + int runnable, + int running, + int cpu) +{ + u64 delta, periods, lru; + u32 runnable_contrib; + int delta_w, decayed = 0; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + u64 scaled_delta; + u32 scaled_runnable_contrib; + int scaled_delta_w; + u32 curr_scale = 1024; +#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY) + u64 scaled_delta; + u32 scaled_runnable_contrib; + int scaled_delta_w; + u32 curr_scale = CPUPOWER_FREQSCALE_DEFAULT; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + + delta = now - sa->last_runnable_update; + lru = sa->last_runnable_update; + /* + * This should only happen when time goes backwards, which it + * unfortunately does during sched clock init when we swap over to TSC. + */ + if ((s64)delta < 0) { + sa->last_runnable_update = now; + return 0; + } + +#ifdef CONFIG_HMP_VARIABLE_SCALE + delta = hmp_variable_scale_convert(delta); +#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY) + delta = variable_scale_convert(delta); +#endif + /* + * Use 1024ns as the unit of measurement since it's a reasonable + * approximation of 1us and fast to compute. + */ + delta >>= 10; + if (!delta) + return 0; + sa->last_runnable_update = now; + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + WARN(cpu < 0, "[%s] CPU %d < 0 !!!\n", __func__, cpu); + /* retrieve scale factor for load */ + if (cpu >= 0 && cpu < nr_cpu_ids && hmp_data.freqinvar_load_scale_enabled) + curr_scale = freq_scale[cpu].curr_scale; + mt_sched_printf(sched_log, "[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u", + __func__, cpu, delta, now, lru, curr_scale); +#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY) + WARN(cpu < 0, "[%s] CPU %d < 0 !!!\n", __func__, cpu); + /* retrieve scale factor for load */ + if (cpu >= 0 && cpu < nr_cpu_ids) + curr_scale = (topology_cpu_capacity(cpu) << CPUPOWER_FREQSCALE_SHIFT) + / (topology_max_cpu_capacity(cpu)+1); + mt_sched_printf(sched_log, "[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u", + __func__, cpu, delta, now, lru, curr_scale); +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + + /* delta_w is the amount already accumulated against our next period */ + delta_w = sa->runnable_avg_period % 1024; + if (delta + delta_w >= 1024) { + /* period roll-over */ + decayed = 1; + + /* + * Now that we know we're crossing a period boundary, figure + * out how much from delta we need to complete the current + * period and accrue it. + */ + delta_w = 1024 - delta_w; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* scale runnable time if necessary */ + scaled_delta_w = (delta_w * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_delta_w; + if (running) + sa->usage_avg_sum += scaled_delta_w; +#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY) + /* scale runnable time if necessary */ + scaled_delta_w = (delta_w * curr_scale) + >> CPUPOWER_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_delta_w; + if (running) + sa->usage_avg_sum += scaled_delta_w; +#else + if (runnable) + sa->runnable_avg_sum += delta_w; + if (running) + sa->usage_avg_sum += delta_w; +#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + sa->runnable_avg_period += delta_w; + + delta -= delta_w; + + /* Figure out how many additional periods this update spans */ + periods = delta / 1024; + delta %= 1024; + /* decay the load we have accumulated so far */ + sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, + periods + 1); + sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + periods + 1); + sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1); + /* add the contribution from this period */ + /* Efficiently calculate \sum (1..n_period) 1024*y^i */ + runnable_contrib = __compute_runnable_contrib(periods); +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* Apply load scaling if necessary. + * Note that multiplying the whole series is same as + * multiplying all terms + */ + scaled_runnable_contrib = (runnable_contrib * curr_scale) + >> SCHED_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_runnable_contrib; + if (running) + sa->usage_avg_sum += scaled_runnable_contrib; +#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY) + /* Apply load scaling if necessary. + * Note that multiplying the whole series is same as + * multiplying all terms + */ + scaled_runnable_contrib = (runnable_contrib * curr_scale) + >> CPUPOWER_FREQSCALE_SHIFT; + if (runnable) + sa->runnable_avg_sum += scaled_runnable_contrib; + if (running) + sa->usage_avg_sum += scaled_runnable_contrib; +#else + if (runnable) + sa->runnable_avg_sum += runnable_contrib; + if (running) + sa->usage_avg_sum += runnable_contrib; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + sa->runnable_avg_period += runnable_contrib; + } + + /* Remainder of delta accrued against u_0` */ +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* scale if necessary */ + scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT); + if (runnable) + sa->runnable_avg_sum += scaled_delta; + if (running) + sa->usage_avg_sum += scaled_delta; +#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY) + /* scale if necessary */ + scaled_delta = ((delta * curr_scale) >> CPUPOWER_FREQSCALE_SHIFT); + if (runnable) + sa->runnable_avg_sum += scaled_delta; + if (running) + sa->usage_avg_sum += scaled_delta; +#else + if (runnable) + sa->runnable_avg_sum += delta; + if (running) + sa->usage_avg_sum += delta; +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + sa->runnable_avg_period += delta; + + return decayed; +} + +/* Synchronize an entity's decay with its parenting cfs_rq.*/ +static inline u64 __synchronize_entity_decay(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 decays = atomic64_read(&cfs_rq->decay_counter); + + decays -= se->avg.decay_count; + if (!decays) + return 0; + + se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.decay_count = 0; + + return decays; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) +{ + struct task_group *tg = cfs_rq->tg; + long tg_contrib; + + tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; + tg_contrib -= cfs_rq->tg_load_contrib; + + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { + atomic_long_add(tg_contrib, &tg->load_avg); + cfs_rq->tg_load_contrib += tg_contrib; + } +} + +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + long contrib, usage_contrib; + + /* The fraction of a cpu used by this cfs_rq */ + contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + contrib -= cfs_rq->tg_runnable_contrib; + + usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT, + sa->runnable_avg_period + 1); + usage_contrib -= cfs_rq->tg_usage_contrib; + + /* + * contrib/usage at this point represent deltas, only update if they + * are substantive. + */ + if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) || + (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) { + atomic_add(contrib, &tg->runnable_avg); + cfs_rq->tg_runnable_contrib += contrib; + + atomic_add(usage_contrib, &tg->usage_avg); + cfs_rq->tg_usage_contrib += usage_contrib; + } +} + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + struct task_group *tg = cfs_rq->tg; + int runnable_avg; + + u64 contrib; + + contrib = cfs_rq->tg_load_contrib * tg->shares; + se->avg.load_avg_contrib = div_u64(contrib, + atomic_long_read(&tg->load_avg) + 1); + + /* + * For group entities we need to compute a correction term in the case + * that they are consuming <1 cpu so that we would contribute the same + * load as a task of equal weight. + * + * Explicitly co-ordinating this measurement would be expensive, but + * fortunately the sum of each cpus contribution forms a usable + * lower-bound on the true value. + * + * Consider the aggregate of 2 contributions. Either they are disjoint + * (and the sum represents true value) or they are disjoint and we are + * understating by the aggregate of their overlap. + * + * Extending this to N cpus, for a given overlap, the maximum amount we + * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of + * cpus that overlap for this interval and w_i is the interval width. + * + * On a small machine; the first term is well-bounded which bounds the + * total error since w_i is a subset of the period. Whereas on a + * larger machine, while this first term can be larger, if w_i is the + * of consequential size guaranteed to see n_i*w_i quickly converge to + * our upper bound of 1-cpu. + */ + runnable_avg = atomic_read(&tg->runnable_avg); + if (runnable_avg < NICE_0_LOAD) { + se->avg.load_avg_contrib *= runnable_avg; + se->avg.load_avg_contrib >>= NICE_0_SHIFT; + } +} +#else +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, + int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, + struct cfs_rq *cfs_rq) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} +#endif + +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); + contrib /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_contrib = scale_load(contrib); +} + +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.load_avg_contrib; + + if (entity_is_task(se)) { + __update_task_entity_contrib(se); + } else { + __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); + __update_group_entity_contrib(se); + } + + return se->avg.load_avg_contrib - old_contrib; +} + +#if defined(CONFIG_MTK_SCHED_CMP) || defined(CONFIG_SCHED_HMP_ENHANCEMENT) +/* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */ +static long __update_task_entity_ratio(struct sched_entity *se) +{ + long old_ratio = se->avg.load_avg_ratio; + u32 ratio; + + ratio = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD); + ratio /= (se->avg.runnable_avg_period + 1); + se->avg.load_avg_ratio = scale_load(ratio); + + return se->avg.load_avg_ratio - old_ratio; +} +#else +static inline long __update_task_entity_ratio(struct sched_entity *se) { return 0; } +#endif + +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, + long load_contrib) +{ + if (likely(load_contrib < cfs_rq->blocked_load_avg)) + cfs_rq->blocked_load_avg -= load_contrib; + else + cfs_rq->blocked_load_avg = 0; +} + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER +unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL); +#endif + +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT +/* Schedule entity */ +#define se_pid(se) ((se != NULL && entity_is_task(se))? \ + container_of(se,struct task_struct,se)->pid:-1) +#define se_load(se) se->avg.load_avg_ratio +#define se_contrib(se) se->avg.load_avg_contrib + +/* CPU related : load information */ +#define cfs_pending_load(cpu) cpu_rq(cpu)->cfs.avg.pending_load +#define cfs_load(cpu) cpu_rq(cpu)->cfs.avg.load_avg_ratio +#define cfs_contrib(cpu) cpu_rq(cpu)->cfs.avg.load_avg_contrib + +/* CPU related : the number of tasks */ +#define cfs_nr_normal_prio(cpu) cpu_rq(cpu)->cfs.avg.nr_normal_prio +#define cfs_nr_pending(cpu) cpu_rq(cpu)->cfs.avg.nr_pending +#define cfs_length(cpu) cpu_rq(cpu)->cfs.h_nr_running +#define rq_length(cpu) (cpu_rq(cpu)->nr_running + cfs_nr_pending(cpu)) + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER +#define task_low_priority(prio) ((prio >= hmp_up_prio)?1:0) +#define cfs_nr_dequeuing_low_prio(cpu) \ + cpu_rq(cpu)->cfs.avg.nr_dequeuing_low_prio +#define cfs_reset_nr_dequeuing_low_prio(cpu) \ + (cfs_nr_dequeuing_low_prio(cpu) = 0) +#else +#define task_low_priority(prio) (0) +#define cfs_reset_nr_dequeuing_low_prio(cpu) +#endif /* CONFIG_SCHED_HMP_PRIO_FILTER */ +#endif /* CONFIG_SCHED_HMP_ENHANCEMENT */ + +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + +#ifdef CONFIG_MTK_SCHED_CMP_TGS +int group_leader_is_empty(struct task_struct *p) { + + struct task_struct *tg = p->group_leader; + + if (SIGNAL_GROUP_EXIT & p->signal->flags){ + // pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: pid(%d) state(%d) exit_state(%d)signal_flags=%x p->signal->flags=%x group_exit_code=%x\n", __func__, + // p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty", + // p->tgid, tg->state, tg->exit_state, tg->state, p->signal->flags, p->signal->group_exit_code); + return 1; + } + + // workaround debug codes + if(tg->state == 0x6b6b6b6b){ + // pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: state(%d) exit_state(%d)\n", __func__, + // p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty", + // tg->state, tg->exit_state); + return 1; + } + + return 0; +} + +static inline void update_tg_info(struct cfs_rq *cfs_rq, struct sched_entity *se, long ratio_delta) +{ + struct task_struct *p = task_of(se); + struct task_struct *tg = p->group_leader; + int id; + unsigned long flags; + + if (group_leader_is_empty(p)) + return; + id = get_cluster_id(cfs_rq->rq->cpu); + if (unlikely(WARN_ON(id < 0))) + return; + + raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags); + tg->thread_group_info[id].load_avg_ratio += ratio_delta; + raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags); + + mt_sched_printf(sched_cmp, "update_tg_info %d:%s %d:%s %ld %ld %d %d %lu:%lu:%lu update", + tg->pid, tg->comm, p->pid, p->comm, + se->avg.load_avg_ratio, ratio_delta, + cfs_rq->rq->cpu, id, + tg->thread_group_info[id].nr_running, + tg->thread_group_info[id].cfs_nr_running, + tg->thread_group_info[id].load_avg_ratio); + +} +#endif + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + long contrib_delta; + u64 now; + long ratio_delta = 0; + int cpu = -1; /* not used in normal case */ + +#if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) \ + || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY) + cpu = cfs_rq->rq->cpu; +#endif + + /* + * For a group entity we need to use their owned cfs_rq_clock_task() in + * case they are the parent of a throttled hierarchy. + */ + if (entity_is_task(se)) + now = cfs_rq_clock_task(cfs_rq); + else + now = cfs_rq_clock_task(group_cfs_rq(se)); + + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + cfs_rq->curr == se, cpu)) { +#if 0 + if (entity_is_task(se)) { + ratio_delta = __update_task_entity_ratio(se); + if (update_cfs_rq) + { + cpu = cfs_rq->rq->cpu; + cpu_rq(cpu)->cfs.avg.load_avg_ratio += ratio_delta; +#ifdef CONFIG_HMP_TRACER + trace_sched_cfs_load_update(task_of(se),se_load(se),ratio_delta, cpu); +#endif /* CONFIG_HMP_TRACER */ + } + + trace_sched_task_entity_avg(2, task_of(se), &se->avg); +#ifdef CONFIG_MTK_SCHED_CMP_TGS + if (se->on_rq) { + update_tg_info(cfs_rq, se, ratio_delta); + } +#endif + } +#endif + return; + } + + contrib_delta = __update_entity_load_avg_contrib(se); + + /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */ + if (entity_is_task(se)) { + ratio_delta = __update_task_entity_ratio(se); + /* + * ratio is re-estimated just for entity of task; as + * for contrib, mark tracer here for task entity while + * mining tg's at __update_group_entity_contrib(). + * + * track running usage in passing. + */ + trace_sched_task_entity_avg(3, task_of(se), &se->avg); + } + + if (!update_cfs_rq) + return; + + if (se->on_rq) { + cfs_rq->runnable_load_avg += contrib_delta; + if (entity_is_task(se)) { + cpu = cfs_rq->rq->cpu; + cpu_rq(cpu)->cfs.avg.load_avg_ratio += ratio_delta; + cpu_rq(cpu)->cfs.avg.load_avg_contrib += contrib_delta; +#ifdef CONFIG_HMP_TRACER + trace_sched_cfs_load_update(task_of(se),se_load(se),ratio_delta,cpu); +#endif /* CONFIG_HMP_TRACER */ +#ifdef CONFIG_MTK_SCHED_CMP_TGS + update_tg_info(cfs_rq, se, ratio_delta); +#endif + } + } + else + subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + + +/* + * Decay the load contributed by all blocked children and account this so that + * their contribution may appropriately discounted when they wake up. + */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +{ + u64 now = cfs_rq_clock_task(cfs_rq) >> 20; + u64 decays; + + decays = now - cfs_rq->last_decay; + if (!decays && !force_update) + return; + + if (atomic_long_read(&cfs_rq->removed_load)) { + unsigned long removed_load; + removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); + subtract_blocked_load_contrib(cfs_rq, removed_load); + } + + if (decays) { + cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, + decays); + atomic64_add(decays, &cfs_rq->decay_counter); + cfs_rq->last_decay = now; + } + + __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); +} + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + u32 contrib; + int cpu = -1; /* not used in normal case */ + +#if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) \ + || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY) + cpu = rq->cpu; +#endif + __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable, + runnable, cpu); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); + contrib = rq->avg.runnable_avg_sum * scale_load_down(1024); + contrib /= (rq->avg.runnable_avg_period + 1); + trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib)); + trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg); +} + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) +{ + int cpu = cfs_rq->rq->cpu; + + /* + * We track migrations using entity decay_count <= 0, on a wake-up + * migration we use a negative decay count to track the remote decays + * accumulated while sleeping. + * + * Newly forked tasks are enqueued with se->avg.decay_count == 0, they + * are seen by enqueue_entity_load_avg() as a migration with an already + * constructed load_avg_contrib. + */ + if (unlikely(se->avg.decay_count <= 0)) { + se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; + if (se->avg.decay_count) { + /* + * In a wake-up migration we have to approximate the + * time sleeping. This is because we can't synchronize + * clock_task between the two cpus, and it is not + * guaranteed to be read-safe. Instead, we can + * approximate this using our carried decays, which are + * explicitly atomically readable. + */ + se->avg.last_runnable_update -= (-se->avg.decay_count) + << 20; + update_entity_load_avg(se, 0); + /* Indicate that we're now synchronized and on-rq */ + se->avg.decay_count = 0; +#ifdef CONFIG_MTK_SCHED_CMP + } else { + if (entity_is_task(se)) + trace_sched_task_entity_avg(1, task_of(se), &se->avg); +#endif + } + wakeup = 0; + } else { + __synchronize_entity_decay(se); + } + + /* migrated tasks did not contribute to our blocked load */ + if (wakeup) { + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); + update_entity_load_avg(se, 0); + } + + cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; +#ifdef CONFIG_MTK_SCHED_CMP_TGS + if(entity_is_task(se)){ + update_tg_info(cfs_rq, se, se->avg.load_avg_ratio); + } +#endif + + if (entity_is_task(se)) { + cpu_rq(cpu)->cfs.avg.load_avg_contrib += se->avg.load_avg_contrib; + cpu_rq(cpu)->cfs.avg.load_avg_ratio += se->avg.load_avg_ratio; +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + cfs_nr_pending(cpu) = 0; + cfs_pending_load(cpu) = 0; +#endif +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + if(!task_low_priority(task_of(se)->prio)) + cfs_nr_normal_prio(cpu)++; +#endif +#ifdef CONFIG_HMP_TRACER + trace_sched_cfs_enqueue_task(task_of(se),se_load(se),cpu); +#endif + } + + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !wakeup); +} + +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. + */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) +{ + int cpu = cfs_rq->rq->cpu; + + update_entity_load_avg(se, 1); + /* we force update consideration on load-balancer moves */ + update_cfs_rq_blocked_load(cfs_rq, !sleep); + + cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; +#ifdef CONFIG_MTK_SCHED_CMP_TGS + if(entity_is_task(se)){ + update_tg_info(cfs_rq, se, -se->avg.load_avg_ratio); + } +#endif + + if (entity_is_task(se)) { + cpu_rq(cpu)->cfs.avg.load_avg_contrib -= se->avg.load_avg_contrib; + cpu_rq(cpu)->cfs.avg.load_avg_ratio -= se->avg.load_avg_ratio; +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + cfs_reset_nr_dequeuing_low_prio(cpu); + if(!task_low_priority(task_of(se)->prio)) + cfs_nr_normal_prio(cpu)--; +#endif +#ifdef CONFIG_HMP_TRACER + trace_sched_cfs_dequeue_task(task_of(se),se_load(se),cpu); +#endif + } + + if (sleep) { + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + } /* migrations, e.g. sleep=0 leave decay_count == 0 */ +} + +/* + * Update the rq's load with the elapsed running time before entering + * idle. if the last scheduled task is not a CFS task, idle_enter will + * be the only way to update the runnable statistic. + */ +void idle_enter_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 1); +} + +/* + * Update the rq's load with the elapsed idle time before a task is + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will + * be the only way to update the runnable statistic. + */ +void idle_exit_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 0); +} + +#else +static inline void update_entity_load_avg(struct sched_entity *se, + int update_cfs_rq) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int wakeup) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, + struct sched_entity *se, + int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, + int force_update) {} +#endif + +static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHEDSTATS + struct task_struct *tsk = NULL; + + if (entity_is_task(se)) + tsk = task_of(se); + + if (se->statistics.sleep_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->statistics.sleep_max)) + se->statistics.sleep_max = delta; + + se->statistics.sleep_start = 0; + se->statistics.sum_sleep_runtime += delta; + + if (tsk) { + account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } + } + if (se->statistics.block_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->statistics.block_max)) + se->statistics.block_max = delta; + + se->statistics.block_start = 0; + se->statistics.sum_sleep_runtime += delta; + + if (tsk) { + if (tsk->in_iowait) { + se->statistics.iowait_sum += delta; + se->statistics.iowait_count++; + trace_sched_stat_iowait(tsk, delta); + } + + trace_sched_stat_blocked(tsk, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); + } + } +#endif +} + +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) + d = -d; + + if (d > 3*sysctl_sched_latency) + schedstat_inc(cfs_rq, nr_spread_over); +#endif +} + +static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +{ + u64 vruntime = cfs_rq->min_vruntime; + + /* + * The 'current' period is already promised to the current tasks, + * however the extra weight of the new task will slow them down a + * little, place the new task so that it fits in the slot that + * stays open at the end. + */ + if (initial && sched_feat(START_DEBIT)) + vruntime += sched_vslice(cfs_rq, se); + + /* sleeps up to a single latency don't count. */ + if (!initial) { + unsigned long thresh = sysctl_sched_latency; + + /* + * Halve their sleep time's effect, to allow + * for a gentler effect of sleepers: + */ + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; + + vruntime -= thresh; + } + + /* ensure we never gain time by being placed backwards. */ + se->vruntime = max_vruntime(se->vruntime, vruntime); +} + +static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + +static void +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + /* + * Update the normalized vruntime before updating min_vruntime + * through calling update_curr(). + */ + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) + se->vruntime += cfs_rq->min_vruntime; + + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); + account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); + + if (flags & ENQUEUE_WAKEUP) { + place_entity(cfs_rq, se, 0); + enqueue_sleeper(cfs_rq, se); + } + + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + se->on_rq = 1; + + if (cfs_rq->nr_running == 1) { + list_add_leaf_cfs_rq(cfs_rq); + check_enqueue_throttle(cfs_rq); + } +} + +static void __clear_buddies_last(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->last == se) + cfs_rq->last = NULL; + else + break; + } +} + +static void __clear_buddies_next(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->next == se) + cfs_rq->next = NULL; + else + break; + } +} + +static void __clear_buddies_skip(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->skip == se) + cfs_rq->skip = NULL; + else + break; + } +} + +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->last == se) + __clear_buddies_last(se); + + if (cfs_rq->next == se) + __clear_buddies_next(se); + + if (cfs_rq->skip == se) + __clear_buddies_skip(se); +} + +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +static void +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); + + update_stats_dequeue(cfs_rq, se); + if (flags & DEQUEUE_SLEEP) { +#ifdef CONFIG_SCHEDSTATS + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_of(cfs_rq)->clock; + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_of(cfs_rq)->clock; + } +#endif + } + + clear_buddies(cfs_rq, se); + + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + account_entity_dequeue(cfs_rq, se); + + /* + * Normalize the entity after updating the min_vruntime because the + * update can refer to the ->curr item and we need to reflect this + * movement in our normalized position. + */ + if (!(flags & DEQUEUE_SLEEP)) + se->vruntime -= cfs_rq->min_vruntime; + + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + + update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq); +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + unsigned long ideal_runtime, delta_exec; + struct sched_entity *se; + s64 delta; + + ideal_runtime = sched_slice(cfs_rq, curr); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { + resched_task(rq_of(cfs_rq)->curr); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. + */ + clear_buddies(cfs_rq, curr); + return; + } + + /* + * Ensure that a task that missed wakeup preemption by a + * narrow margin doesn't have to wait for a full slice. + * This also mitigates buddy induced latencies under load. + */ + if (delta_exec < sysctl_sched_min_granularity) + return; + + se = __pick_first_entity(cfs_rq); + delta = curr->vruntime - se->vruntime; + + if (delta < 0) + return; + + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); +} + +static void +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* 'current' is not kept within the tree. */ + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + update_stats_wait_end(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); + } + + update_stats_curr_start(cfs_rq, se); + cfs_rq->curr = se; +#ifdef CONFIG_SCHEDSTATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + se->statistics.slice_max = max(se->statistics.slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime); + } +#endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; +} + +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the "next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +{ + struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *left = se; + + /* + * Avoid running the skip buddy, if running something else can + * be done without getting too unfair. + */ + if (cfs_rq->skip == se) { + struct sched_entity *second = __pick_next_entity(se); + if (second && wakeup_preempt_entity(second, left) < 1) + se = second; + } + + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; + + /* + * Someone really wants this to run. If it's not unfair, run it. + */ + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; + + clear_buddies(cfs_rq, se); + + return se; +} + +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) +{ + /* + * If still on the runqueue then deactivate_task() + * was not called and update_curr() has to be done: + */ + if (prev->on_rq) + update_curr(cfs_rq); + + /* throttle cfs_rqs exceeding runtime */ + check_cfs_rq_runtime(cfs_rq); + + check_spread(cfs_rq, prev); + if (prev->on_rq) { + update_stats_wait_start(cfs_rq, prev); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); + /* in !on_rq case, update occurred at dequeue */ + update_entity_load_avg(prev, 1); + } + cfs_rq->curr = NULL; +} + +static void +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_entity_load_avg(curr, 1); + update_cfs_rq_blocked_load(cfs_rq, 1); + update_cfs_shares(cfs_rq); + +#ifdef CONFIG_SCHED_HRTICK + /* + * queued ticks are scheduled to match the slice, so don't bother + * validating it and just reschedule. + */ + if (queued) { + resched_task(rq_of(cfs_rq)->curr); + return; + } + /* + * don't let the period tick interfere with the hrtick preemption + */ + if (!sched_feat(DOUBLE_TICK) && + hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) + return; +#endif + + if (cfs_rq->nr_running > 1) + check_preempt_tick(cfs_rq, curr); +} + + +/************************************************** + * CFS bandwidth control machinery + */ + +#ifdef CONFIG_CFS_BANDWIDTH + +#ifdef HAVE_JUMP_LABEL +static struct static_key __cfs_bandwidth_used; + +static inline bool cfs_bandwidth_used(void) +{ + return static_key_false(&__cfs_bandwidth_used); +} + +void cfs_bandwidth_usage_inc(void) +{ + static_key_slow_inc(&__cfs_bandwidth_used); +} + +void cfs_bandwidth_usage_dec(void) +{ + static_key_slow_dec(&__cfs_bandwidth_used); +} +#else /* HAVE_JUMP_LABEL */ +static bool cfs_bandwidth_used(void) +{ + return true; +} + +void cfs_bandwidth_usage_inc(void) {} +void cfs_bandwidth_usage_dec(void) {} +#endif /* HAVE_JUMP_LABEL */ + +/* + * default period for cfs group bandwidth. + * default: 0.1s, units: nanoseconds + */ +static inline u64 default_cfs_period(void) +{ + return 100000000ULL; +} + +static inline u64 sched_cfs_bandwidth_slice(void) +{ + return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; +} + +/* + * Replenish runtime according to assigned quota and update expiration time. + * We use sched_clock_cpu directly instead of rq->clock to avoid adding + * additional synchronization around rq->lock. + * + * requires cfs_b->lock + */ +void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) +{ + u64 now; + + if (cfs_b->quota == RUNTIME_INF) + return; + + now = sched_clock_cpu(smp_processor_id()); + cfs_b->runtime = cfs_b->quota; + cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); +} + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return &tg->cfs_bandwidth; +} + +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task; + + return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +} + +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + u64 amount = 0, min_amount, expires; + + /* note: this is a positive sum as runtime_remaining <= 0 */ + min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota == RUNTIME_INF) + amount = min_amount; + else { + /* + * If the bandwidth pool has become inactive, then at least one + * period must have elapsed since the last consumption. + * Refresh the global state and ensure bandwidth timer becomes + * active. + */ + if (!cfs_b->timer_active) { + __refill_cfs_bandwidth_runtime(cfs_b); + __start_cfs_bandwidth(cfs_b); + } + + if (cfs_b->runtime > 0) { + amount = min(cfs_b->runtime, min_amount); + cfs_b->runtime -= amount; + cfs_b->idle = 0; + } + } + expires = cfs_b->runtime_expires; + raw_spin_unlock(&cfs_b->lock); + + cfs_rq->runtime_remaining += amount; + /* + * we may have advanced our local expiration to account for allowed + * spread between our sched_clock and the one on which runtime was + * issued. + */ + if ((s64)(expires - cfs_rq->runtime_expires) > 0) + cfs_rq->runtime_expires = expires; + + return cfs_rq->runtime_remaining > 0; +} + +/* + * Note: This depends on the synchronization provided by sched_clock and the + * fact that rq->clock snapshots this value. + */ +static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct rq *rq = rq_of(cfs_rq); + + /* if the deadline is ahead of our clock, nothing to do */ + if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + return; + + if (cfs_rq->runtime_remaining < 0) + return; + + /* + * If the local deadline has passed we have to consider the + * possibility that our sched_clock is 'fast' and the global deadline + * has not truly expired. + * + * Fortunately we can check determine whether this the case by checking + * whether the global deadline has advanced. + */ + + if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + /* extend local deadline, drift is bounded above by 2 ticks */ + cfs_rq->runtime_expires += TICK_NSEC; + } else { + /* global deadline is ahead, expiration has passed */ + cfs_rq->runtime_remaining = 0; + } +} + +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) +{ + /* dock delta_exec before expiring quota (as it could span periods) */ + cfs_rq->runtime_remaining -= delta_exec; + expire_cfs_rq_runtime(cfs_rq); + + if (likely(cfs_rq->runtime_remaining > 0)) + return; + + /* + * if we're unable to extend our runtime we resched so that the active + * hierarchy can be throttled + */ + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) + resched_task(rq_of(cfs_rq)->curr); +} + +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +{ + if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) + return; + + __account_cfs_rq_runtime(cfs_rq, delta_exec); +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->throttled; +} + +/* check whether cfs_rq, or any parent, is throttled */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return cfs_bandwidth_used() && cfs_rq->throttle_count; +} + +/* + * Ensure that neither of the group entities corresponding to src_cpu or + * dest_cpu are members of a throttled hierarchy when performing group + * load-balance operations. + */ +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + + src_cfs_rq = tg->cfs_rq[src_cpu]; + dest_cfs_rq = tg->cfs_rq[dest_cpu]; + + return throttled_hierarchy(src_cfs_rq) || + throttled_hierarchy(dest_cfs_rq); +} + +/* updated child weight may affect parent so we have to do this bottom up */ +static int tg_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; +#ifdef CONFIG_SMP + if (!cfs_rq->throttle_count) { + /* adjust cfs_rq_clock_task() */ + cfs_rq->throttled_clock_task_time += rq->clock_task - + cfs_rq->throttled_clock_task; + } +#endif + + return 0; +} + +static int tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + /* group is entering throttled state, stop time */ + if (!cfs_rq->throttle_count) + cfs_rq->throttled_clock_task = rq->clock_task; + cfs_rq->throttle_count++; + + return 0; +} + +static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, dequeue = 1; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + /* freeze hierarchy runnable averages while throttled */ + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + break; + + if (dequeue) + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + qcfs_rq->h_nr_running -= task_delta; + + if (qcfs_rq->load.weight) + dequeue = 0; + } + + if (!se) + rq->nr_running -= task_delta; + + cfs_rq->throttled = 1; + cfs_rq->throttled_clock = rq->clock; + raw_spin_lock(&cfs_b->lock); + list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (!cfs_b->timer_active) + __start_cfs_bandwidth(cfs_b); + raw_spin_unlock(&cfs_b->lock); +} + +void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + int enqueue = 1; + long task_delta; + + se = cfs_rq->tg->se[cpu_of(rq)]; + + cfs_rq->throttled = 0; + raw_spin_lock(&cfs_b->lock); + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; + list_del_rcu(&cfs_rq->throttled_list); + raw_spin_unlock(&cfs_b->lock); + + update_rq_clock(rq); + /* update hierarchical throttle state */ + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + if (!se) + rq->nr_running += task_delta; + + /* determine whether we need to wake up potentially idle cpu */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_task(rq->curr); +} + +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, + u64 remaining, u64 expires) +{ + struct cfs_rq *cfs_rq; + u64 runtime = remaining; + + rcu_read_lock(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, + throttled_list) { + struct rq *rq = rq_of(cfs_rq); + + raw_spin_lock(&rq->lock); + if (!cfs_rq_throttled(cfs_rq)) + goto next; + + runtime = -cfs_rq->runtime_remaining + 1; + if (runtime > remaining) + runtime = remaining; + remaining -= runtime; + + cfs_rq->runtime_remaining += runtime; + cfs_rq->runtime_expires = expires; + + /* we check whether we're throttled above */ + if (cfs_rq->runtime_remaining > 0) + unthrottle_cfs_rq(cfs_rq); + +next: + raw_spin_unlock(&rq->lock); + + if (!remaining) + break; + } + rcu_read_unlock(); + + return remaining; +} + +/* + * Responsible for refilling a task_group's bandwidth and unthrottling its + * cfs_rqs as appropriate. If there has been no activity within the last + * period the timer is deactivated until scheduling resumes; cfs_b->idle is + * used to track this state. + */ +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +{ + u64 runtime, runtime_expires; + int idle = 1, throttled; + + raw_spin_lock(&cfs_b->lock); + /* no need to continue the timer with no bandwidth constraint */ + if (cfs_b->quota == RUNTIME_INF) + goto out_unlock; + + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + /* idle depends on !throttled (for the case of a large deficit) */ + idle = cfs_b->idle && !throttled; + cfs_b->nr_periods += overrun; + + /* if we're going inactive then everything else can be deferred */ + if (idle) + goto out_unlock; + + /* + * if we have relooped after returning idle once, we need to update our + * status as actually running, so that other cpus doing + * __start_cfs_bandwidth will stop trying to cancel us. + */ + cfs_b->timer_active = 1; + + __refill_cfs_bandwidth_runtime(cfs_b); + + if (!throttled) { + /* mark as potentially idle for the upcoming period */ + cfs_b->idle = 1; + goto out_unlock; + } + + /* account preceding periods in which throttling occurred */ + cfs_b->nr_throttled += overrun; + + /* + * There are throttled entities so we must first use the new bandwidth + * to unthrottle them before making it generally available. This + * ensures that all existing debts will be paid before a new cfs_rq is + * allowed to run. + */ + runtime = cfs_b->runtime; + runtime_expires = cfs_b->runtime_expires; + cfs_b->runtime = 0; + + /* + * This check is repeated as we are holding onto the new bandwidth + * while we unthrottle. This can potentially race with an unthrottled + * group trying to acquire new bandwidth from the global pool. + */ + while (throttled && runtime > 0) { + raw_spin_unlock(&cfs_b->lock); + /* we can't nest cfs_b->lock while distributing bandwidth */ + runtime = distribute_cfs_runtime(cfs_b, runtime, + runtime_expires); + raw_spin_lock(&cfs_b->lock); + + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + } + + /* return (any) remaining runtime */ + cfs_b->runtime = runtime; + /* + * While we are ensured activity in the period following an + * unthrottle, this also covers the case in which the new bandwidth is + * insufficient to cover the existing bandwidth deficit. (Forcing the + * timer to remain active while there are any throttled entities.) + */ + cfs_b->idle = 0; +out_unlock: + if (idle) + cfs_b->timer_active = 0; + raw_spin_unlock(&cfs_b->lock); + + return idle; +} + +/* a cfs_rq won't donate quota below this amount */ +static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; +/* minimum remaining period time to redistribute slack quota */ +static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; +/* how long we wait to gather additional slack before distributing */ +static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; + +/* + * Are we near the end of the current quota period? + * + * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the + * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * migrate_hrtimers, base is never cleared, so we are fine. + */ +static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) +{ + struct hrtimer *refresh_timer = &cfs_b->period_timer; + u64 remaining; + + /* if the call-back is running a quota refresh is already occurring */ + if (hrtimer_callback_running(refresh_timer)) + return 1; + + /* is a quota refresh about to occur? */ + remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); + if (remaining < min_expire) + return 1; + + return 0; +} + +static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) +{ + u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; + + /* if there's a quota refresh soon don't bother with slack */ + if (runtime_refresh_within(cfs_b, min_left)) + return; + + start_bandwidth_timer(&cfs_b->slack_timer, + ns_to_ktime(cfs_bandwidth_slack_period)); +} + +/* we know any runtime found here is valid as update_curr() precedes return */ +static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; + + if (slack_runtime <= 0) + return; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF && + cfs_rq->runtime_expires == cfs_b->runtime_expires) { + cfs_b->runtime += slack_runtime; + + /* we are under rq->lock, defer unthrottling using a timer */ + if (cfs_b->runtime > sched_cfs_bandwidth_slice() && + !list_empty(&cfs_b->throttled_cfs_rq)) + start_cfs_slack_bandwidth(cfs_b); + } + raw_spin_unlock(&cfs_b->lock); + + /* even if it's not valid for return we don't want to try again */ + cfs_rq->runtime_remaining -= slack_runtime; +} + +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + if (!cfs_bandwidth_used()) + return; + + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + return; + + __return_cfs_rq_runtime(cfs_rq); +} + +/* + * This is done with a timer (instead of inline with bandwidth return) since + * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. + */ +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) +{ + u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + u64 expires; + + /* confirm we're still not at a refresh boundary */ + raw_spin_lock(&cfs_b->lock); + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { + raw_spin_unlock(&cfs_b->lock); + return; + } + + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + runtime = cfs_b->runtime; + cfs_b->runtime = 0; + } + expires = cfs_b->runtime_expires; + raw_spin_unlock(&cfs_b->lock); + + if (!runtime) + return; + + runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + + raw_spin_lock(&cfs_b->lock); + if (expires == cfs_b->runtime_expires) + cfs_b->runtime = runtime; + raw_spin_unlock(&cfs_b->lock); +} + +/* + * When a group wakes up we want to make sure that its quota is not already + * expired/exceeded, otherwise it may be allowed to steal additional ticks of + * runtime as update_curr() throttling can not not trigger until it's on-rq. + */ +static void check_enqueue_throttle(struct cfs_rq *cfs_rq) +{ + if (!cfs_bandwidth_used()) + return; + + /* an active group must be handled by the update_curr()->put() path */ + if (!cfs_rq->runtime_enabled || cfs_rq->curr) + return; + + /* ensure the group is not already throttled */ + if (cfs_rq_throttled(cfs_rq)) + return; + + /* update runtime allocation */ + account_cfs_rq_runtime(cfs_rq, 0); + if (cfs_rq->runtime_remaining <= 0) + throttle_cfs_rq(cfs_rq); +} + +/* conditionally throttle active cfs_rq's from put_prev_entity() */ +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + if (!cfs_bandwidth_used()) + return; + + if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + return; + + /* + * it's possible for a throttled entity to be forced into a running + * state (e.g. set_curr_task), in this case we're finished. + */ + if (cfs_rq_throttled(cfs_rq)) + return; + + throttle_cfs_rq(cfs_rq); +} + +static inline u64 default_cfs_period(void); +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); + +static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); + + return HRTIMER_NORESTART; +} + +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, cfs_b->period); + + if (!overrun) + break; + + idle = do_sched_cfs_period_timer(cfs_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + raw_spin_lock_init(&cfs_b->lock); + cfs_b->runtime = 0; + cfs_b->quota = RUNTIME_INF; + cfs_b->period = ns_to_ktime(default_cfs_period()); + + INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->slack_timer.function = sched_cfs_slack_timer; +} + +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + cfs_rq->runtime_enabled = 0; + INIT_LIST_HEAD(&cfs_rq->throttled_list); +} + +/* requires cfs_b->lock, may release to reprogram timer */ +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + /* + * The timer may be active because we're trying to set a new bandwidth + * period or because we're racing with the tear-down path + * (timer_active==0 becomes visible before the hrtimer call-back + * terminates). In either case we ensure that it's re-programmed + */ + while (unlikely(hrtimer_active(&cfs_b->period_timer)) && + hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { + /* bounce the lock to allow do_sched_cfs_period_timer to run */ + raw_spin_unlock(&cfs_b->lock); + cpu_relax(); + raw_spin_lock(&cfs_b->lock); + /* if someone else restarted the timer then we're done */ + if (cfs_b->timer_active) + return; + } + + cfs_b->timer_active = 1; + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); +} + +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + hrtimer_cancel(&cfs_b->period_timer); + hrtimer_cancel(&cfs_b->slack_timer); +} + +static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + + if (!cfs_rq->runtime_enabled) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = cfs_b->quota; + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + } +} + +#else /* CONFIG_CFS_BANDWIDTH */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ + return rq_of(cfs_rq)->clock_task; +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + return 0; +} + +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +#endif + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return NULL; +} +static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} + +#endif /* CONFIG_CFS_BANDWIDTH */ + +/************************************************** + * CFS operations on tasks: + */ + +#ifdef CONFIG_SCHED_HRTICK +static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + WARN_ON(task_rq(p) != rq); + + if (cfs_rq->nr_running > 1) { + u64 slice = sched_slice(cfs_rq, se); + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + s64 delta = slice - ran; + + if (delta < 0) { + if (rq->curr == p) + resched_task(p); + return; + } + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + if (rq->curr != p) + delta = max_t(s64, 10000LL, delta); + + hrtick_start(rq, delta); + } +} + +/* + * called from enqueue/dequeue and updates the hrtick when the + * current task is from our class and nr_running is low enough + * to matter. + */ +static void hrtick_update(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + + if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + return; + + if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) + hrtick_start_fair(rq, curr); +} +#else /* !CONFIG_SCHED_HRTICK */ +static inline void +hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ +} + +static inline void hrtick_update(struct rq *rq) +{ +} +#endif + +#if defined(CONFIG_SCHED_HMP) || defined(CONFIG_MTK_SCHED_CMP) + +/* CPU cluster statistics for task migration control */ +#define HMP_GB (0x1000) +#define HMP_SELECT_RQ (0x2000) +#define HMP_LB (0x4000) +#define HMP_MAX_LOAD (NICE_0_LOAD - 1) + + +struct clb_env { + struct clb_stats bstats; + struct clb_stats lstats; + int btarget, ltarget; + + struct cpumask *bcpus; + struct cpumask *lcpus; + + unsigned int flags; + struct mcheck { + int status; /* Details of this migration check */ + int result; /* Indicate whether we should perform this task migration */ + } mcheck; +}; + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu); + +static void collect_cluster_stats(struct clb_stats *clbs, + struct cpumask *cluster_cpus, int target) +{ +#define HMP_RESOLUTION_SCALING (4) +#define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING) + + /* Update cluster informatics */ + int cpu; + for_each_cpu(cpu, cluster_cpus) { + if(cpu_online(cpu)) { + clbs->ncpu ++; + clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running; + clbs->load_avg += cpu_rq(cpu)->cfs.avg.load_avg_ratio; +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + clbs->nr_normal_prio_task += cfs_nr_normal_prio(cpu); + clbs->nr_dequeuing_low_prio += cfs_nr_dequeuing_low_prio(cpu); +#endif + } + } + + if(!clbs->ncpu || NR_CPUS == target || !cpumask_test_cpu(target,cluster_cpus)) + return; + + clbs->cpu_power = (int) arch_scale_freq_power(NULL, target); + + /* Scale current CPU compute capacity in accordance with frequency */ + clbs->cpu_capacity = HMP_MAX_LOAD; +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + if (hmp_data.freqinvar_load_scale_enabled) { + cpu = cpumask_any(cluster_cpus); + if (freq_scale[cpu].throttling == 1){ + clbs->cpu_capacity *= freq_scale[cpu].curr_scale; + }else { + clbs->cpu_capacity *= freq_scale[cpu].max; + } + clbs->cpu_capacity >>= SCHED_FREQSCALE_SHIFT; + + if (clbs->cpu_capacity > HMP_MAX_LOAD){ + clbs->cpu_capacity = HMP_MAX_LOAD; + } + } +#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY) + if (topology_cpu_inv_power_en()) { + cpu = cpumask_any(cluster_cpus); + if (topology_cpu_throttling(cpu)) + clbs->cpu_capacity *= + (topology_cpu_capacity(cpu) << CPUPOWER_FREQSCALE_SHIFT) + / (topology_max_cpu_capacity(cpu)+1); + else + clbs->cpu_capacity *= topology_max_cpu_capacity(cpu); + clbs->cpu_capacity >>= CPUPOWER_FREQSCALE_SHIFT; + + if (clbs->cpu_capacity > HMP_MAX_LOAD){ + clbs->cpu_capacity = HMP_MAX_LOAD; + } + } +#endif + + /* + * Calculate available CPU capacity + * Calculate available task space + * + * Why load ratio should be multiplied by the number of task ? + * The task is the entity of scheduling unit so that we should consider + * it in scheduler. Only considering task load is not enough. + * Thus, multiplying the number of tasks can adjust load ratio to a more + * reasonable value. + */ + clbs->load_avg /= clbs->ncpu; + clbs->acap = clbs->cpu_capacity - cpu_rq(target)->cfs.avg.load_avg_ratio; + clbs->scaled_acap = hmp_scale_down(clbs->acap); + clbs->scaled_atask = cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.load_avg_ratio; + clbs->scaled_atask = clbs->cpu_capacity - clbs->scaled_atask; + clbs->scaled_atask = hmp_scale_down(clbs->scaled_atask); + + mt_sched_printf(sched_log, "[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n", __func__, + target, *cpumask_bits(cluster_cpus), + cpu_rq(target)->cfs.avg.load_avg_ratio, cpu_rq(target)->cfs.h_nr_running, + clbs->ncpu, clbs->ntask, clbs->load_avg, clbs->cpu_capacity, + clbs->acap, clbs->scaled_acap, clbs->scaled_atask, clbs->threshold); +} + +//#define USE_HMP_DYNAMIC_THRESHOLD +#if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD) +static inline void hmp_dynamic_threshold(struct clb_env *clbenv); +#endif + +/* + * Task Dynamic Migration Threshold Adjustment. + * + * If the workload between clusters is not balanced, adjust migration + * threshold in an attempt to move task precisely. + * + * Diff. = Max Threshold - Min Threshold + * + * Dynamic UP-Threshold = + * B_nacap B_natask + * Max Threshold - Diff. x ----------------- x ------------------- + * B_nacap + L_nacap B_natask + L_natask + * + * + * Dynamic Down-Threshold = + * L_nacap L_natask + * Min Threshold + Diff. x ----------------- x ------------------- + * B_nacap + L_nacap B_natask + L_natask + */ +static void adj_threshold(struct clb_env *clbenv) +{ +#define TSKLD_SHIFT (2) +#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x)) + + int bcpu, lcpu; + unsigned long b_cap=0, l_cap=0; + unsigned long b_load=0, l_load=0; + unsigned long b_task=0, l_task=0; + int b_nacap, l_nacap, b_natask, l_natask; + +#if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD) + hmp_dynamic_threshold(clbenv); + return; +#endif + + bcpu = clbenv->btarget; + lcpu = clbenv->ltarget; + if (bcpu < nr_cpu_ids) { + b_load = cpu_rq(bcpu)->cfs.avg.load_avg_ratio; + b_task = cpu_rq(bcpu)->cfs.h_nr_running; + } + if (lcpu < nr_cpu_ids) { + l_load = cpu_rq(lcpu)->cfs.avg.load_avg_ratio; + l_task = cpu_rq(lcpu)->cfs.h_nr_running; + } + +#ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY + if (bcpu < nr_cpu_ids) { + b_cap = topology_cpu_capacity(bcpu); + } + if (lcpu < nr_cpu_ids) { + l_cap = topology_cpu_capacity(lcpu); + } + + b_nacap = POSITIVE(b_cap - b_load); + b_natask = POSITIVE(b_cap - ((b_task * b_load) >> TSKLD_SHIFT)); + l_nacap = POSITIVE(l_cap - l_load); + l_natask = POSITIVE(l_cap - ((l_task * l_load) >> TSKLD_SHIFT)); +#else /* !CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */ + b_cap = clbenv->bstats.cpu_power; + l_cap = clbenv->lstats.cpu_power; + b_nacap = POSITIVE(clbenv->bstats.scaled_acap * + clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1)); + b_natask = POSITIVE(clbenv->bstats.scaled_atask * + clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1)); + l_nacap = POSITIVE(clbenv->lstats.scaled_acap); + l_natask = POSITIVE(clbenv->bstats.scaled_atask); + +#endif /* CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */ + + clbenv->bstats.threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask / + ((b_nacap + l_nacap) * (b_natask + l_natask)+1); + clbenv->lstats.threshold = HMP_MAX_LOAD * l_nacap * l_natask / + ((b_nacap + l_nacap) * (b_natask + l_natask)+1); + + mt_sched_printf(sched_log, "[%s]\tup/dl:%4d/%4d L(%d:%4lu,%4lu/%4lu) b(%d:%4lu,%4lu/%4lu)\n", __func__, + clbenv->bstats.threshold, clbenv->lstats.threshold, + lcpu, l_load, l_task, l_cap, + bcpu, b_load, b_task, b_cap); +} + +static void sched_update_clbstats(struct clb_env *clbenv) +{ + collect_cluster_stats(&clbenv->bstats, clbenv->bcpus, clbenv->btarget); + collect_cluster_stats(&clbenv->lstats, clbenv->lcpus, clbenv->ltarget); + adj_threshold(clbenv); +} +#endif /* #if defined(CONFIG_SCHED_HMP) || defined(CONFIG_SCHED_CMP) */ + + +#ifdef CONFIG_SCHED_HMP +/* + * Heterogenous multiprocessor (HMP) optimizations + * + * The cpu types are distinguished using a list of hmp_domains + * which each represent one cpu type using a cpumask. + * The list is assumed ordered by compute capacity with the + * fastest domain first. + */ +DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain); +/* We need to know which cpus are fast and slow. */ +extern struct cpumask hmp_fast_cpu_mask; +extern struct cpumask hmp_slow_cpu_mask; + +extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list); + +/* Setup hmp_domains */ +static int __init hmp_cpu_mask_setup(void) +{ + char buf[64]; + struct hmp_domain *domain; + struct list_head *pos; + int dc, cpu; + +#if defined(CONFIG_SCHED_HMP_ENHANCEMENT) || defined(CONFIG_MT_RT_SCHED) + cpumask_clear(&hmp_fast_cpu_mask); + cpumask_clear(&hmp_slow_cpu_mask); +#endif + + pr_debug("Initializing HMP scheduler:\n"); + + /* Initialize hmp_domains using platform code */ + arch_get_hmp_domains(&hmp_domains); + if (list_empty(&hmp_domains)) { + pr_debug("HMP domain list is empty!\n"); + return 0; + } + + /* Print hmp_domains */ + dc = 0; + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + cpulist_scnprintf(buf, 64, &domain->possible_cpus); + pr_debug(" HMP domain %d: %s\n", dc, buf); + + /* + * According to the description in "arch_get_hmp_domains", + * Fastest domain is at head of list. Thus, the fast-cpu mask should + * be initialized first, followed by slow-cpu mask. + */ +#if defined(CONFIG_SCHED_HMP_ENHANCEMENT) defined(CONFIG_MT_RT_SCHED) + if(cpumask_empty(&hmp_fast_cpu_mask)) { + cpumask_copy(&hmp_fast_cpu_mask,&domain->possible_cpus); + for_each_cpu(cpu, &hmp_fast_cpu_mask) + pr_debug(" HMP fast cpu : %d\n",cpu); + } else if (cpumask_empty(&hmp_slow_cpu_mask)){ + cpumask_copy(&hmp_slow_cpu_mask,&domain->possible_cpus); + for_each_cpu(cpu, &hmp_slow_cpu_mask) + pr_debug(" HMP slow cpu : %d\n",cpu); + } +#endif + + for_each_cpu_mask(cpu, domain->possible_cpus) { + per_cpu(hmp_cpu_domain, cpu) = domain; + } + dc++; + } + + return 1; +} + +static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu) +{ + struct hmp_domain *domain; + struct list_head *pos; + + list_for_each(pos, &hmp_domains) { + domain = list_entry(pos, struct hmp_domain, hmp_domains); + if(cpumask_test_cpu(cpu, &domain->possible_cpus)) + return domain; + } + return NULL; +} + +static void hmp_online_cpu(int cpu) +{ + struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + + if(domain) + cpumask_set_cpu(cpu, &domain->cpus); +} + +static void hmp_offline_cpu(int cpu) +{ + struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu); + + if(domain) + cpumask_clear_cpu(cpu, &domain->cpus); +} + +/* + * Migration thresholds should be in the range [0..1023] + * hmp_up_threshold: min. load required for migrating tasks to a faster cpu + * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu + * The default values (512, 256) offer good responsiveness, but may need + * tweaking suit particular needs. + * + * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio) + * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms) + * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms) + */ +#ifdef CONFIG_HMP_DYNAMIC_THRESHOLD +unsigned int hmp_up_threshold = 1023; +unsigned int hmp_down_threshold = 0; +#else +unsigned int hmp_up_threshold = 512; +unsigned int hmp_down_threshold = 256; +#endif + +unsigned int hmp_next_up_threshold = 4096; +unsigned int hmp_next_down_threshold = 4096; +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT +#define hmp_last_up_migration(cpu) \ + cpu_rq(cpu)->cfs.avg.hmp_last_up_migration +#define hmp_last_down_migration(cpu) \ + cpu_rq(cpu)->cfs.avg.hmp_last_down_migration +static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p, + int prev_cpu, int new_cpu); +#else +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se); +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se); +#endif +static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, + int *min_cpu); + +/* Check if cpu is in fastest hmp_domain */ +static inline unsigned int hmp_cpu_is_fastest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return pos == hmp_domains.next; +} + +/* Check if cpu is in slowest hmp_domain */ +static inline unsigned int hmp_cpu_is_slowest(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_is_last(pos, &hmp_domains); +} + +/* Next (slower) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_slower_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->next, struct hmp_domain, hmp_domains); +} + +/* Previous (faster) hmp_domain relative to cpu */ +static inline struct hmp_domain *hmp_faster_domain(int cpu) +{ + struct list_head *pos; + + pos = &hmp_cpu_domain(cpu)->hmp_domains; + return list_entry(pos->prev, struct hmp_domain, hmp_domains); +} + +/* + * Selects a cpu in previous (faster) hmp_domain + * Note that cpumask_any_and() returns the first cpu in the cpumask + */ +static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk, + int cpu) +{ + int lowest_cpu=NR_CPUS; + __always_unused int lowest_ratio = hmp_domain_min_load(hmp_faster_domain(cpu), &lowest_cpu); + /* + * If the lowest-loaded CPU in the domain is allowed by the task affinity + * select that one, otherwise select one which is allowed + */ + if(lowest_cpu < nr_cpu_ids && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk))) + return lowest_cpu; + else + return cpumask_any_and(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(tsk)); +} + +/* + * Selects a cpu in next (slower) hmp_domain + * Note that cpumask_any_and() returns the first cpu in the cpumask + */ +static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk, + int cpu) +{ + int lowest_cpu=NR_CPUS; + __always_unused int lowest_ratio = hmp_domain_min_load(hmp_slower_domain(cpu), &lowest_cpu); + /* + * If the lowest-loaded CPU in the domain is allowed by the task affinity + * select that one, otherwise select one which is allowed + */ + if(lowest_cpu < nr_cpu_ids && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk))) + return lowest_cpu; + else + return cpumask_any_and(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(tsk)); +} + +static inline void hmp_next_up_delay(struct sched_entity *se, int cpu) +{ +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + hmp_last_up_migration(cpu) = cfs_rq_clock_task(cfs_rq); + hmp_last_down_migration(cpu) = 0; +#else + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + + se->avg.hmp_last_up_migration = cfs_rq_clock_task(cfs_rq); + se->avg.hmp_last_down_migration = 0; +#endif +} + +static inline void hmp_next_down_delay(struct sched_entity *se, int cpu) +{ +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + hmp_last_down_migration(cpu) = cfs_rq_clock_task(cfs_rq); + hmp_last_up_migration(cpu) = 0; +#else + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + + se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq); + se->avg.hmp_last_up_migration = 0; +#endif +} + +#ifdef CONFIG_HMP_VARIABLE_SCALE +/* + * Heterogenous multiprocessor (HMP) optimizations + * + * These functions allow to change the growing speed of the load_avg_ratio + * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms + * This can now be changed with /sys/kernel/hmp/load_avg_period_ms. + * + * These functions also allow to change the up and down threshold of HMP + * using /sys/kernel/hmp/{up,down}_threshold. + * Both must be between 0 and 1023. The threshold that is compared + * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024. + * + * For instance, if load_avg_period = 64 and up_threshold = 512, an idle + * task with a load of 0 will reach the threshold after 64ms of busy loop. + * + * Changing load_avg_periods_ms has the same effect than changing the + * default scaling factor Y=1002/1024 in the load_avg_ratio computation to + * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one + * could trigger overflows. + * For instance, with Y = 1023/1024 in __update_task_entity_contrib() + * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);" + * could be overflowed for a weight > 2^12 even is the load_avg_contrib + * should still be a 32bits result. This would not happen by multiplicating + * delta time by 1/22 and setting load_avg_period_ms = 706. + */ + +/* + * By scaling the delta time it end-up increasing or decrease the + * growing speed of the per entity load_avg_ratio + * The scale factor hmp_data.multiplier is a fixed point + * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT + */ +static u64 hmp_variable_scale_convert(u64 delta) +{ + u64 high = delta >> 32ULL; + u64 low = delta & 0xffffffffULL; + low *= hmp_data.multiplier; + high *= hmp_data.multiplier; + return (low >> HMP_VARIABLE_SCALE_SHIFT) + + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT)); +} + +static ssize_t hmp_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + ssize_t ret = 0; + struct hmp_global_attr *hmp_attr = + container_of(attr, struct hmp_global_attr, attr); + int temp = *(hmp_attr->value); + if (hmp_attr->to_sysfs != NULL) + temp = hmp_attr->to_sysfs(temp); + ret = sprintf(buf, "%d\n", temp); + return ret; +} + +static ssize_t hmp_store(struct kobject *a, struct attribute *attr, + const char *buf, size_t count) +{ + int temp; + ssize_t ret = count; + struct hmp_global_attr *hmp_attr = + container_of(attr, struct hmp_global_attr, attr); + char *str = vmalloc(count + 1); + if (str == NULL) + return -ENOMEM; + memcpy(str, buf, count); + str[count] = 0; + if (sscanf(str, "%d", &temp) < 1) + ret = -EINVAL; + else { + if (hmp_attr->from_sysfs != NULL) + temp = hmp_attr->from_sysfs(temp); + if (temp < 0) + ret = -EINVAL; + else + *(hmp_attr->value) = temp; + } + vfree(str); + return ret; +} + +static int hmp_period_tofrom_sysfs(int value) +{ + return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value; +} + +/* max value for threshold is 1024 */ +static int hmp_theshold_from_sysfs(int value) +{ + if (value > 1024) + return -1; + return value; +} +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +/* freqinvar control is only 0,1 off/on */ +static int hmp_freqinvar_from_sysfs(int value) +{ + if (value < 0 || value > 1) + return -1; + return value; +} +#endif +static void hmp_attr_add( + const char *name, + int *value, + int (*to_sysfs)(int), + int (*from_sysfs)(int)) +{ + int i = 0; + while (hmp_data.attributes[i] != NULL) { + i++; + if (i >= HMP_DATA_SYSFS_MAX) + return; + } + hmp_data.attr[i].attr.mode = 0644; + hmp_data.attr[i].show = hmp_show; + hmp_data.attr[i].store = hmp_store; + hmp_data.attr[i].attr.name = name; + hmp_data.attr[i].value = value; + hmp_data.attr[i].to_sysfs = to_sysfs; + hmp_data.attr[i].from_sysfs = from_sysfs; + hmp_data.attributes[i] = &hmp_data.attr[i].attr; + hmp_data.attributes[i + 1] = NULL; +} + +static int hmp_attr_init(void) +{ + int ret; + memset(&hmp_data, sizeof(hmp_data), 0); + /* by default load_avg_period_ms == LOAD_AVG_PERIOD + * meaning no change + */ + /* LOAD_AVG_PERIOD is too short to trigger heavy task indicator + so we change it to LOAD_AVG_VARIABLE_PERIOD */ + hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_VARIABLE_PERIOD); + + hmp_attr_add("load_avg_period_ms", + &hmp_data.multiplier, + hmp_period_tofrom_sysfs, + hmp_period_tofrom_sysfs); + hmp_attr_add("up_threshold", + &hmp_up_threshold, + NULL, + hmp_theshold_from_sysfs); + hmp_attr_add("down_threshold", + &hmp_down_threshold, + NULL, + hmp_theshold_from_sysfs); + hmp_attr_add("init_task_load_period", + &init_task_load_period, + NULL, + NULL); +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + /* default frequency-invariant scaling ON */ + hmp_data.freqinvar_load_scale_enabled = 1; + hmp_attr_add("frequency_invariant_load_scale", + &hmp_data.freqinvar_load_scale_enabled, + NULL, + hmp_freqinvar_from_sysfs); +#endif + hmp_data.attr_group.name = "hmp"; + hmp_data.attr_group.attrs = hmp_data.attributes; + ret = sysfs_create_group(kernel_kobj, + &hmp_data.attr_group); + return 0; +} +late_initcall(hmp_attr_init); +#endif /* CONFIG_HMP_VARIABLE_SCALE */ + +static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd, + int *min_cpu) +{ + int cpu; + int min_cpu_runnable_temp = NR_CPUS; + unsigned long min_runnable_load = INT_MAX; + unsigned long contrib; + + for_each_cpu_mask(cpu, hmpd->cpus) { + /* don't use the divisor in the loop, just at the end */ + contrib = cpu_rq(cpu)->avg.runnable_avg_sum * scale_load_down(1024); + if (contrib < min_runnable_load) { + min_runnable_load = contrib; + min_cpu_runnable_temp = cpu; + } + } + + if (min_cpu) + *min_cpu = min_cpu_runnable_temp; + + /* domain will often have at least one empty CPU */ + return min_runnable_load ? min_runnable_load / (LOAD_AVG_MAX + 1) : 0; +} + +/* + * Calculate the task starvation + * This is the ratio of actually running time vs. runnable time. + * If the two are equal the task is getting the cpu time it needs or + * it is alone on the cpu and the cpu is fully utilized. + */ +static inline unsigned int hmp_task_starvation(struct sched_entity *se) +{ + u32 starvation; + + starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD); + starvation /= (se->avg.runnable_avg_sum + 1); + + return scale_load(starvation); +} + +static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se) +{ + int min_usage; + int dest_cpu = NR_CPUS; + + if (hmp_cpu_is_slowest(cpu)) + return NR_CPUS; + + /* Is the current domain fully loaded? */ + /* load < ~50% */ + min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL); + if (min_usage < (NICE_0_LOAD>>1)) + return NR_CPUS; + + /* Is the task alone on the cpu? */ + if (cpu_rq(cpu)->cfs.nr_running < 2) + return NR_CPUS; + + /* Is the task actually starving? */ + /* >=25% ratio running/runnable = starving */ + if (hmp_task_starvation(se) > 768) + return NR_CPUS; + + /* Does the slower domain have spare cycles? */ + min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu); + /* load > 50% */ + if (min_usage > NICE_0_LOAD/2) + return NR_CPUS; + + if (cpumask_test_cpu(dest_cpu, &hmp_slower_domain(cpu)->cpus)) + return dest_cpu; + + return NR_CPUS; +} +#endif /* CONFIG_SCHED_HMP */ + + +#ifdef CONFIG_MTK_SCHED_CMP +/* Check if cpu is in fastest hmp_domain */ +unsigned int cmp_up_threshold = 512; +unsigned int cmp_down_threshold = 256; +#endif /* CONFIG_MTK_SCHED_CMP */ + +#ifdef CONFIG_MTK_SCHED_CMP_TGS +static void sched_tg_enqueue_fair(struct rq *rq, struct task_struct *p) +{ + int id; + unsigned long flags; + struct task_struct *tg = p->group_leader; + + if (group_leader_is_empty(p)) + return; + id = get_cluster_id(rq->cpu); + if (unlikely(WARN_ON(id < 0))) + return; + + raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags); + tg->thread_group_info[id].cfs_nr_running++; + raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags); +} + +static void sched_tg_dequeue_fair(struct rq *rq, struct task_struct *p) +{ + int id; + unsigned long flags; + struct task_struct *tg = p->group_leader; + + if (group_leader_is_empty(p)) + return; + id = get_cluster_id(rq->cpu); + if (unlikely(WARN_ON(id < 0))) + return; + + raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags); + tg->thread_group_info[id].cfs_nr_running--; + raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags); +} + +#endif +/* + * The enqueue_task method is called before nr_running is + * increased. Here we update the fair scheduling stats and + * then put the task into the rbtree: + */ +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + for_each_sched_entity(se) { + if (se->on_rq) + break; + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, flags); + + /* + * end evaluation on encountering a throttled cfs_rq + * + * note: in the case of encountering a throttled cfs_rq we will + * post the final h_nr_running increment below. + */ + if (cfs_rq_throttled(cfs_rq)) + break; + cfs_rq->h_nr_running++; + + flags = ENQUEUE_WAKEUP; + } + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_running++; + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); + } + + if (!se) { + update_rq_runnable_avg(rq, rq->nr_running); + inc_nr_running(rq); +#ifndef CONFIG_CFS_BANDWIDTH + BUG_ON(rq->cfs.nr_running > rq->cfs.h_nr_running); +#endif + } + hrtick_update(rq); +#ifdef CONFIG_HMP_TRACER + trace_sched_runqueue_length(rq->cpu,rq->nr_running); + trace_sched_cfs_length(rq->cpu,rq->cfs.h_nr_running); +#endif +#ifdef CONFIG_MET_SCHED_HMP + RqLen(rq->cpu,rq->nr_running); + CfsLen(rq->cpu,rq->cfs.h_nr_running); +#endif + +#ifdef CONFIG_MTK_SCHED_CMP_TGS + sched_tg_enqueue_fair(rq, p); +#endif +} + +static void set_next_buddy(struct sched_entity *se); + +/* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and + * update the fair scheduling stats: + */ +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + int task_sleep = flags & DEQUEUE_SLEEP; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + dequeue_entity(cfs_rq, se, flags); + + /* + * end evaluation on encountering a throttled cfs_rq + * + * note: in the case of encountering a throttled cfs_rq we will + * post the final h_nr_running decrement below. + */ + if (cfs_rq_throttled(cfs_rq)) + break; + cfs_rq->h_nr_running--; + + /* Don't dequeue parent if it has other entities besides us */ + if (cfs_rq->load.weight) { + /* + * Bias pick_next to pick a task from this cfs_rq, as + * p is sleeping when it is within its sched_slice. + */ + if (task_sleep && parent_entity(se)) + set_next_buddy(parent_entity(se)); + + /* avoid re-evaluating load for this entity */ + se = parent_entity(se); + break; + } + flags |= DEQUEUE_SLEEP; + } + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_running--; + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_cfs_shares(cfs_rq); + update_entity_load_avg(se, 1); + } + + if (!se) { + dec_nr_running(rq); +#ifndef CONFIG_CFS_BANDWIDTH + BUG_ON(rq->cfs.nr_running > rq->cfs.h_nr_running); +#endif + update_rq_runnable_avg(rq, 1); + } + hrtick_update(rq); +#ifdef CONFIG_HMP_TRACER + trace_sched_runqueue_length(rq->cpu,rq->nr_running); + trace_sched_cfs_length(rq->cpu,rq->cfs.h_nr_running); +#endif +#ifdef CONFIG_MET_SCHED_HMP + RqLen(rq->cpu,rq->nr_running); + CfsLen(rq->cpu,rq->cfs.h_nr_running); +#endif + +#ifdef CONFIG_MTK_SCHED_CMP_TGS + sched_tg_dequeue_fair(rq, p); +#endif +} + +#ifdef CONFIG_SMP +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->cfs.runnable_load_avg; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +static unsigned long power_of(int cpu) +{ + return cpu_rq(cpu)->cpu_power; +} + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long load_avg = rq->cfs.runnable_load_avg; + + if (nr_running) + return load_avg / nr_running; + + return 0; +} + + +static void task_waking_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * effective_load() calculates the load change as seen from the root_task_group + * + * Adding load to a group doesn't make a group heavier, but can cause movement + * of group shares between cpus. Assuming the shares were perfectly aligned one + * can calculate the shift in shares. + * + * Calculate the effective load difference if @wl is added (subtracted) to @tg + * on this @cpu and results in a total addition (subtraction) of @wg to the + * total group weight. + * + * Given a runqueue weight distribution (rw_i) we can compute a shares + * distribution (s_i) using: + * + * s_i = rw_i / \Sum rw_j (1) + * + * Suppose we have 4 CPUs and our @tg is a direct child of the root group and + * has 7 equal weight tasks, distributed as below (rw_i), with the resulting + * shares distribution (s_i): + * + * rw_i = { 2, 4, 1, 0 } + * s_i = { 2/7, 4/7, 1/7, 0 } + * + * As per wake_affine() we're interested in the load of two CPUs (the CPU the + * task used to run on and the CPU the waker is running on), we need to + * compute the effect of waking a task on either CPU and, in case of a sync + * wakeup, compute the effect of the current task going to sleep. + * + * So for a change of @wl to the local @cpu with an overall group weight change + * of @wl we can compute the new shares distribution (s'_i) using: + * + * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) + * + * Suppose we're interested in CPUs 0 and 1, and want to compute the load + * differences in waking a task to CPU 0. The additional task changes the + * weight and shares distributions like: + * + * rw'_i = { 3, 4, 1, 0 } + * s'_i = { 3/8, 4/8, 1/8, 0 } + * + * We can then compute the difference in effective weight by using: + * + * dw_i = S * (s'_i - s_i) (3) + * + * Where 'S' is the group weight as seen by its parent. + * + * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) + * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - + * 4/7) times the weight of the group. + */ +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) +{ + struct sched_entity *se = tg->se[cpu]; + + if (!tg->parent) /* the trivial, non-cgroup case */ + return wl; + + for_each_sched_entity(se) { + long w, W; + + tg = se->my_q->tg; + + /* + * W = @wg + \Sum rw_j + */ + W = wg + calc_tg_weight(tg, se->my_q); + + /* + * w = rw_i + @wl + */ + w = se->my_q->load.weight + wl; + + /* + * wl = S * s'_i; see (2) + */ + if (W > 0 && w < W) + wl = (w * tg->shares) / W; + else + wl = tg->shares; + + /* + * Per the above, wl is the new se->load.weight value; since + * those are clipped to [MIN_SHARES, ...) do so now. See + * calc_cfs_shares(). + */ + if (wl < MIN_SHARES) + wl = MIN_SHARES; + + /* + * wl = dw_i = S * (s'_i - s_i); see (3) + */ + wl -= se->load.weight; + + /* + * Recursively apply this logic to all parent groups to compute + * the final effective load change on the root group. Since + * only the @tg group gets extra weight, all parent groups can + * only redistribute existing shares. @wl is the shift in shares + * resulting from this level per the above. + */ + wg = 0; + } + + return wl; +} +#else + +static inline unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) +{ + return wl; +} + +#endif + +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +{ + s64 this_load, load; + int idx, this_cpu, prev_cpu; + unsigned long tl_per_task; + struct task_group *tg; + unsigned long weight; + int balanced; + + idx = sd->wake_idx; + this_cpu = smp_processor_id(); + prev_cpu = task_cpu(p); + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) { + tg = task_group(current); + weight = current->se.load.weight; + + this_load += effective_load(tg, this_cpu, -weight, -weight); + load += effective_load(tg, prev_cpu, 0, -weight); + } + + tg = task_group(p); + weight = p->se.load.weight; + + /* + * In low-load situations, where prev_cpu is idle and this_cpu is idle + * due to the sync cause above having dropped this_load to 0, we'll + * always have an imbalance, but there's really nothing you can do + * about that, so that's good too. + * + * Otherwise check if either cpus are near enough in load to allow this + * task to be woken on this_cpu. + */ + if (this_load > 0) { + s64 this_eff_load, prev_eff_load; + + this_eff_load = 100; + this_eff_load *= power_of(prev_cpu); + this_eff_load *= this_load + + effective_load(tg, this_cpu, weight, weight); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= power_of(this_cpu); + prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + + balanced = this_eff_load <= prev_eff_load; + } else + balanced = true; + + /* + * If the currently running task will sleep within + * a reasonable amount of time then attract this newly + * woken task: + */ + if (sync && balanced) + return 1; + + schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + if (balanced || + (this_load <= load && + this_load + target_load(prev_cpu, idx) <= tl_per_task)) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(sd, ttwu_move_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); + + return 1; + } + return 0; +} + +#ifdef CONFIG_MT_SCHED_INTEROP +#define MT_RT_LOAD (2*1023*scale_load_down(scale_load(prio_to_weight[0]))) +static inline unsigned long mt_rt_load(int cpu){ + return cpu_rq(cpu)->rt.rt_nr_running * MT_RT_LOAD; +} +#endif +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int load_idx) +{ + struct sched_group *idlest = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_cpus(group), + tsk_cpus_allowed(p))) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu(i, sched_group_cpus(group)) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + +#ifdef CONFIG_MT_SCHED_INTEROP + load += mt_rt_load(i); +#endif + avg_load += load; + + mt_sched_printf(sched_log, "find_idlest_group cpu=%d avg=%lu", + i, avg_load); + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + + if (local_group) { + this_load = avg_load; + mt_sched_printf(sched_log, "find_idlest_group this_load=%lu", + this_load); + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + mt_sched_printf(sched_log, "find_idlest_group min_load=%lu", + min_load); + } + } while (group = group->next, group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load){ + mt_sched_printf(sched_log, "find_idlest_group fail this_load=%lu min_load=%lu, imbalance=%d", + this_load, min_load, imbalance); + return NULL; + } + return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. + */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + /* Traverse only the allowed CPUs */ + for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { + load = weighted_cpuload(i); + +#ifdef CONFIG_MT_SCHED_INTEROP + load += mt_rt_load(i); +#endif + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + +/* + * Try and locate an idle CPU in the sched_domain. + */ +static int select_idle_sibling(struct task_struct *p, int target) +{ + struct sched_domain *sd; + struct sched_group *sg; + int i = task_cpu(p); + + if (idle_cpu(target)) + return target; + + /* + * If the prevous cpu is cache affine and idle, don't be stupid. + */ + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; + + /* + * Otherwise, iterate the domains and find an elegible idle cpu. + */ + sd = rcu_dereference(per_cpu(sd_llc, target)); + for_each_lower_domain(sd) { + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; + + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + goto done; +next: + sg = sg->next; + } while (sg != sd->groups); + } +done: + return target; +} + +#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP +/* + * @p: the task want to be located at. + * @clid: the CPU cluster id to be search for the target CPU + * @target: the appropriate CPU for task p, updated by this function. + * + * Return: + * + * 1 on success + * 0 if target CPU is not found in this CPU cluster + */ +static int cmp_find_idle_cpu(struct cpumask *cpus) +{ + int j; + + for_each_cpu(j, cpus) { + if (idle_cpu(j)) + return j; + } + + return -1; +} + +#if !defined(CONFIG_SCHED_HMP) +#define TGS_WAKEUP_EXPERIMENT +#endif +static int cmp_select_task_rq_fair(struct task_struct *p, int prev_cpu) +{ + int i, j; + int in_prev, prev_cluster; + int max_tg_cnt, tg_cnt; + int max_idle_cnt, idle_cnt; + struct cpumask max_idle_mask, idle_mask; + struct cpumask cls_cpus, allowed_mask; + int num_cluster; + + in_prev = 0; + max_tg_cnt = 0; + max_idle_cnt = 0; + cpumask_clear(&max_idle_mask); + + num_cluster = arch_get_nr_clusters(); + for(i = 0; i < num_cluster; i++) { + + cpumask_clear(&idle_mask); + get_cluster_cpus(&cls_cpus, i, true); + cpumask_and(&allowed_mask, &cls_cpus, tsk_cpus_allowed(p)); + + if(!cpumask_weight(&allowed_mask)) + continue; + +#ifdef TGS_WAKEUP_EXPERIMENT + if (arch_is_big_little()) { + int bcpu; + j = cpumask_any(&allowed_mask); + bcpu = arch_cpu_is_big(j); + if (bcpu && (p->se.avg.load_avg_ratio >= cmp_up_threshold)) { + mt_sched_printf(sched_cmp, "[heavy task] wakeup load=%ld up_th=%u pid=%d name=%s prev_cpu=%d", + p->se.avg.load_avg_ratio, cmp_up_threshold, p->pid, p->comm, prev_cpu); + return cmp_find_idle_cpu(&allowed_mask); + } + if (!bcpu && (p->se.avg.load_avg_ratio < cmp_down_threshold)) { + mt_sched_printf(sched_cmp, "[light task] wakeup load=%ld down_th=%u pid=%d name=%s prev_cpu=%d", + p->se.avg.load_avg_ratio, cmp_down_threshold, p->pid, p->comm, prev_cpu); + return cmp_find_idle_cpu(&allowed_mask); + } + } +#endif + for_each_cpu(j, &allowed_mask) { + if (idle_cpu(j)) + cpumask_set_cpu(j, &idle_mask); + } + + tg_cnt = p->group_leader->thread_group_info[i].nr_running; + mt_sched_printf(sched_cmp, "wakeup pid=%d name=%s load=%ld, cluster=%d allowed_cpu=%02lx, idle_cpu=%02lx tg_cnt=%d max_tg_cnt=%d, onlineCPU=%02lx", + p->pid, p->comm, p->se.avg.load_avg_ratio, i, *cpumask_bits(&allowed_mask), *cpumask_bits(&idle_mask), + tg_cnt, max_tg_cnt, *cpumask_bits(cpu_online_mask)); + + idle_cnt = cpumask_weight(&idle_mask); + if ( 0 == idle_cnt ) + continue; + + prev_cluster = (i==get_cluster_id(prev_cpu))? 1: 0; + + if (tg_cnt > 0) { + /* 1. prefer to into cluster with most thread group cluster*/ + if ( (tg_cnt > max_tg_cnt) || ((tg_cnt == max_tg_cnt) && prev_cluster)) { + in_prev = prev_cluster; + max_idle_cnt = idle_cnt; + max_tg_cnt = tg_cnt; + cpumask_copy(&max_idle_mask,&idle_mask); + } + } else if (0 == max_tg_cnt) { + /* 2. when no thread group, prefer put into the cluster with most idle CPU */ + if ((idle_cnt > max_idle_cnt) || ((idle_cnt == max_idle_cnt) && prev_cluster)) { + in_prev = prev_cluster; + max_idle_cnt = idle_cnt; + cpumask_copy(&max_idle_mask,&idle_mask); + } + + } + mt_sched_printf(sched_cmp, "wakeup %d %s cluster=%d, max_idle_cpu=%02lx max_tg_cnt=%d max_idle_cnt=%d prev_cpu=%d in_prev=%d", + p->pid, p->comm, i, *cpumask_bits(&max_idle_mask), max_tg_cnt, max_idle_cnt, prev_cpu, in_prev); + } + + /* no idle CPU */ + if ( !max_idle_cnt ) + return -1; + + if (in_prev && idle_cpu(prev_cpu)){ + mt_sched_printf(sched_cmp, "wakeup %d %s prev_cpu=%d", p->pid, p->comm, prev_cpu); + return prev_cpu; + } + + for_each_cpu(j, &max_idle_mask) { + if (idle_cpu(j)){ + mt_sched_printf(sched_cmp, "wakeup %d %s idle_cpu=%d", p->pid, p->comm, j); + return j; + } + } + + return -1; + +} + +static int mt_select_task_rq_fair(struct task_struct *p, int prev_cpu){ + return cmp_select_task_rq_fair(p, prev_cpu); +} + +#endif + +#if defined(CONFIG_MT_SCHED_INTEROP) && !defined(CONFIG_MTK_SCHED_CMP_TGS_WAKEUP) +static int interop_select_task_rq_fair(struct task_struct *p, int prev_cpu) +{ + int i; + struct cpumask allowed_mask; + + cpumask_and(&allowed_mask, cpu_online_mask, tsk_cpus_allowed(p)); + + mt_sched_printf(sched_interop, "prev cpu=%d, idle_prev=%d find idle cpu from cpumask 0x%lx", + prev_cpu, idle_cpu(prev_cpu), allowed_mask.bits[0] ); + + if(idle_cpu(prev_cpu)) + return prev_cpu; + + if(!cpumask_weight(&allowed_mask)) + return -1; + + for_each_cpu(i, &allowed_mask) { + if (idle_cpu(i)) + return i; + } + + return -1; + +} + +static int mt_select_task_rq_fair(struct task_struct *p, int prev_cpu){ + return interop_select_task_rq_fair(p, prev_cpu); +} +#endif + +#ifdef CONFIG_MTK_SCHED_TRACERS +#define LB_RESET 0 +#define LB_AFFINITY 0x10 +#define LB_BUDDY 0x20 +#define LB_FORK 0x30 +#define LB_CMP_SHIFT 8 +#define LB_CMP 0x4000 +#define LB_SMP_SHIFT 16 +#define LB_SMP 0x500000 +#define LB_HMP_SHIFT 24 +#define LB_HMP 0x60000000 +#endif + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +{ + struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; + int want_affine = 0; + int sync = wake_flags & WF_SYNC; +#if defined(CONFIG_SCHED_HMP) && !defined(CONFIG_SCHED_HMP_ENHANCEMENT) + int target_cpu = nr_cpu_ids; +#endif +#ifdef CONFIG_MTK_SCHED_TRACERS + int policy = 0; +#endif +#if defined(CONFIG_MTK_SCHED_CMP_TGS_WAKEUP) || defined(CONFIG_MT_SCHED_INTEROP) + int prefer_cpu; +#endif +#ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK + int buddy_cpu = per_cpu(sd_pack_buddy, cpu); +#endif + + if (p->nr_cpus_allowed == 1) + { +#ifdef CONFIG_MTK_SCHED_TRACERS + trace_sched_select_task_rq(p, (LB_AFFINITY | prev_cpu), prev_cpu, prev_cpu); +#endif + return prev_cpu; + } + +#ifdef CONFIG_HMP_PACK_SMALL_TASK +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + if (check_pack_buddy(cpu, p) && PA_ENABLE) { + PACK_FROM_CPUX_TO_CPUY_COUNT[cpu][per_cpu(sd_pack_buddy, cpu)]++; + +#ifdef CONFIG_HMP_TRACER + trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY, p->pid, cpu, per_cpu(sd_pack_buddy, cpu)); +#endif /* CONFIG_HMP_TRACER */ + + if(PA_MON_ENABLE) { + if(strcmp(p->comm, PA_MON) == 0 && cpu != per_cpu(sd_pack_buddy, cpu)) { + printk(KERN_EMERG "[PA] %s PACK From CPU%d to CPU%d\n", p->comm, cpu, per_cpu(sd_pack_buddy, cpu)); + printk(KERN_EMERG "[PA] Buddy RQ Usage = %u, Period = %u, NR = %u\n", + per_cpu(BUDDY_CPU_RQ_USAGE, per_cpu(sd_pack_buddy, cpu)), + per_cpu(BUDDY_CPU_RQ_PERIOD, per_cpu(sd_pack_buddy, cpu)), + per_cpu(BUDDY_CPU_RQ_NR, per_cpu(sd_pack_buddy, cpu))); + printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n", + per_cpu(TASK_USGAE, cpu), + per_cpu(TASK_PERIOD, cpu)); + } + } +#else /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + if (check_pack_buddy(cpu, p)) { +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ +#ifdef CONFIG_MTK_SCHED_TRACERS + new_cpu = per_cpu(sd_pack_buddy, cpu); + trace_sched_select_task_rq(p, (LB_BUDDY | new_cpu), prev_cpu, new_cpu); +#endif + return per_cpu(sd_pack_buddy, cpu); + } +#elif defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK) +#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER + if (PA_ENABLE && (sd_flag & SD_BALANCE_WAKE) && (check_pack_buddy(buddy_cpu, p))) { +#else + if ((sd_flag & SD_BALANCE_WAKE) && (check_pack_buddy(buddy_cpu, p))) { +#endif + struct thread_group_info_t *src_tginfo, *dst_tginfo; + src_tginfo = &p->group_leader->thread_group_info[get_cluster_id(prev_cpu)]; //Compare with previous cpu(Not current cpu) + dst_tginfo = &p->group_leader->thread_group_info[get_cluster_id(buddy_cpu)]; + if((get_cluster_id(prev_cpu) == get_cluster_id(buddy_cpu)) || + (src_tginfo->nr_running < dst_tginfo->nr_running)) + { +#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER + PACK_FROM_CPUX_TO_CPUY_COUNT[cpu][buddy_cpu]++; + mt_sched_printf(sched_pa, "[PA]pid=%d, Pack to CPU%d(CPU%d's buddy)\n", p->pid,buddy_cpu,cpu); + if(PA_MON_ENABLE) { + u8 i=0; + for(i=0;i<4; i++) { + if(strcmp(p->comm, &PA_MON[i][0]) == 0) { + TASK_PACK_CPU_COUNT[i][buddy_cpu]++; + printk(KERN_EMERG "[PA] %s PACK to CPU%d(CPU%d's buddy), pre(cpu%d)\n", p->comm, buddy_cpu,cpu, prev_cpu); + printk(KERN_EMERG "[PA] Buddy RQ Usage = %u, Period = %u, NR = %u\n", + per_cpu(BUDDY_CPU_RQ_USAGE, buddy_cpu), + per_cpu(BUDDY_CPU_RQ_PERIOD, buddy_cpu), + per_cpu(BUDDY_CPU_RQ_NR, buddy_cpu)); + printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n", + per_cpu(TASK_USGAE, cpu), + per_cpu(TASK_PERIOD, cpu)); + break; + } + } + } +#endif //CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER +#ifdef CONFIG_MTK_SCHED_TRACERS + trace_sched_select_task_rq(p, (LB_BUDDY | buddy_cpu), prev_cpu, buddy_cpu); +#endif + return buddy_cpu; + } + } +#endif /* CONFIG_HMP_PACK_SMALL_TASK */ + +#ifdef CONFIG_SCHED_HMP + /* always put non-kernel forking tasks on a big domain */ + if (p->mm && (sd_flag & SD_BALANCE_FORK)) { + if(hmp_cpu_is_fastest(prev_cpu)) { + struct hmp_domain *hmpdom = list_entry(&hmp_cpu_domain(prev_cpu)->hmp_domains, struct hmp_domain, hmp_domains); + __always_unused int lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu); + if(new_cpu < nr_cpu_ids && cpumask_test_cpu(new_cpu,tsk_cpus_allowed(p))) + { +#ifdef CONFIG_MTK_SCHED_TRACERS + trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu); +#endif + return new_cpu; + } + else + { + new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(p)); + if(new_cpu < nr_cpu_ids) + { +#ifdef CONFIG_MTK_SCHED_TRACERS + trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu); +#endif + return new_cpu; + } + } + } else { + new_cpu = hmp_select_faster_cpu(p, prev_cpu); + if (new_cpu < nr_cpu_ids) + { +#ifdef CONFIG_MTK_SCHED_TRACERS + trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu); +#endif + return new_cpu; + } + } + // to recover new_cpu value + if (new_cpu >= nr_cpu_ids) + new_cpu = cpu; + } +#endif + + if (sd_flag & SD_BALANCE_WAKE) { + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + want_affine = 1; + new_cpu = prev_cpu; + } + +#if defined(CONFIG_MTK_SCHED_CMP_TGS_WAKEUP) || defined(CONFIG_MT_SCHED_INTEROP) + prefer_cpu = mt_select_task_rq_fair(p, prev_cpu); + + if ((-1 != prefer_cpu) && (prefer_cpu < nr_cpu_ids)) { + cpu = prefer_cpu; + new_cpu = prefer_cpu; +#ifdef CONFIG_MTK_SCHED_TRACERS + policy |= (new_cpu << LB_CMP_SHIFT); + policy |= LB_CMP; +#endif + mt_sched_printf(sched_log, "cmp/interop wakeup %d %s to cpu %d", + p->pid, p->comm, cpu); + goto mt_found; + } +#endif + rcu_read_lock(); + for_each_domain(cpu, tmp) { + mt_sched_printf(sched_log, "wakeup %d %s tmp->flags=%x, cpu=%d, prev_cpu=%d, new_cpu=%d", + p->pid, p->comm, tmp->flags, cpu, prev_cpu, new_cpu); + + if (!(tmp->flags & SD_LOAD_BALANCE)) + continue; + + /* + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. + */ + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + affine_sd = tmp; + break; + } + + if (tmp->flags & sd_flag) + sd = tmp; + } + + if (affine_sd) { + if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + + new_cpu = select_idle_sibling(p, prev_cpu); + goto unlock; + } + + mt_sched_printf(sched_log, "wakeup %d %s sd=%p", p->pid, p->comm, sd); + + while (sd) { + int load_idx = sd->forkexec_idx; + struct sched_group *group; + int weight; + + mt_sched_printf(sched_log, "wakeup %d %s find_idlest_group cpu=%d sd->flags=%x sd_flag=%x", + p->pid, p->comm, cpu, sd->flags, sd_flag); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; + + mt_sched_printf(sched_log, "wakeup %d %s find_idlest_group cpu=%d", + p->pid, p->comm, cpu); + group = find_idlest_group(sd, p, cpu, load_idx); + if (!group) { + sd = sd->child; + mt_sched_printf(sched_log, "wakeup %d %s find_idlest_group child", + p->pid, p->comm); + continue; + } + + new_cpu = find_idlest_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + mt_sched_printf(sched_log, "wakeup %d %s find_idlest_cpu sd->child=%p", + p->pid, p->comm, sd); + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + mt_sched_printf(sched_log, "wakeup %d %s find_idlest_cpu cpu=%d sd=%p", + p->pid, p->comm, new_cpu, sd); + cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + mt_sched_printf(sched_log, "wakeup %d %s sd=%p weight=%d, tmp->span_weight=%d", + p->pid, p->comm, sd, weight, tmp->span_weight); + } + /* while loop will break here if sd == NULL */ + } + +#ifdef CONFIG_MTK_SCHED_TRACERS + policy |= (new_cpu << LB_SMP_SHIFT); + policy |= LB_SMP; +#endif + +unlock: + rcu_read_unlock(); + mt_sched_printf(sched_log, "wakeup %d %s new_cpu=%x", p->pid, p->comm, new_cpu); + +#if defined(CONFIG_MTK_SCHED_CMP_TGS_WAKEUP) || defined(CONFIG_MT_SCHED_INTEROP) +mt_found: +#endif + +#ifdef CONFIG_SCHED_HMP +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + new_cpu = hmp_select_task_rq_fair(sd_flag, p, prev_cpu, new_cpu); +#ifdef CONFIG_MTK_SCHED_TRACERS + policy |= (new_cpu << LB_HMP_SHIFT); + policy |= LB_HMP; +#endif + +#else + if (hmp_up_migration(prev_cpu, &target_cpu, &p->se)) { + new_cpu = hmp_select_faster_cpu(p, prev_cpu); + hmp_next_up_delay(&p->se, new_cpu); + trace_sched_hmp_migrate(p, new_cpu, 0); + return new_cpu; + } + if (hmp_down_migration(prev_cpu, &p->se)) { + new_cpu = hmp_select_slower_cpu(p, prev_cpu); + hmp_next_down_delay(&p->se, new_cpu); + trace_sched_hmp_migrate(p, new_cpu, 0); + return new_cpu; + } + /* Make sure that the task stays in its previous hmp domain */ + if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus)) + return prev_cpu; +#endif /* CONFIG_SCHED_HMP_ENHANCEMENT */ +#endif /* CONFIG_SCHED_HMP */ + +#ifdef CONFIG_MTK_SCHED_TRACERS + trace_sched_select_task_rq(p, policy, prev_cpu, new_cpu); +#endif + +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + if(PA_MON_ENABLE) { + if(strcmp(p->comm, PA_MON) == 0 && cpu != new_cpu) { + printk(KERN_EMERG "[PA] %s Select From CPU%d to CPU%d\n", p->comm, cpu, new_cpu); + } + } +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + + return new_cpu; +} + +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu. However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including the state of rq->lock, should be made. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Load tracking: accumulate removed load so that it can be processed + * when we next update owning cfs_rq under rq->lock. Tasks contribute + * to blocked load iff they have a positive decay-count. It can never + * be negative here since on-rq tasks have decay-count == 0. + */ + if (se->avg.decay_count) { + se->avg.decay_count = -__synchronize_entity_decay(se); + atomic_long_add(se->avg.load_avg_contrib, + &cfs_rq->removed_load); + } +} +#endif /* CONFIG_SMP */ + +static unsigned long +wakeup_gran(struct sched_entity *curr, struct sched_entity *se) +{ + unsigned long gran = sysctl_sched_wakeup_granularity; + + /* + * Since its curr running now, convert the gran from real-time + * to virtual-time in his units. + * + * By using 'se' instead of 'curr' we penalize light tasks, so + * they get preempted easier. That is, if 'se' < 'curr' then + * the resulting gran will be larger, therefore penalizing the + * lighter, if otoh 'se' > 'curr' then the resulting gran will + * be smaller, again penalizing the lighter task. + * + * This is especially important for buddies when the leftmost + * task is higher priority than the buddy. + */ + return calc_delta_fair(gran, se); +} + +/* + * Should 'se' preempt 'curr'. + * + * |s1 + * |s2 + * |s3 + * g + * |<--->|c + * + * w(c, s1) = -1 + * w(c, s2) = 0 + * w(c, s3) = 1 + * + */ +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +{ + s64 gran, vdiff = curr->vruntime - se->vruntime; + + if (vdiff <= 0) + return -1; + + gran = wakeup_gran(curr, se); + if (vdiff > gran) + return 1; + + return 0; +} + +static void set_last_buddy(struct sched_entity *se) +{ + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->last = se; +} + +static void set_next_buddy(struct sched_entity *se) +{ + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->next = se; +} + +static void set_skip_buddy(struct sched_entity *se) +{ + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + int scale = cfs_rq->nr_running >= sched_nr_latency; + int next_buddy_marked = 0; + + if (unlikely(se == pse)) + return; + + /* + * This is possible from callers such as move_task(), in which we + * unconditionally check_prempt_curr() after an enqueue (which may have + * lead to a throttle). This both saves work and prevents false + * next-buddy nomination below. + */ + if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + return; + + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { + set_next_buddy(pse); + next_buddy_marked = 1; + } + + /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + * + * Note: this also catches the edge-case of curr being in a throttled + * group (e.g. via set_curr_task), since update_curr() (in the + * enqueue of curr) will have resulted in resched being set. This + * prevents us from potentially nominating it as a false LAST_BUDDY + * below. + */ + if (test_tsk_need_resched(curr)) + return; + + /* Idle tasks are by definition preempted by non-idle tasks. */ + if (unlikely(curr->policy == SCHED_IDLE) && + likely(p->policy != SCHED_IDLE)) + goto preempt; + + /* + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): + */ + if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + return; + + find_matching_se(&se, &pse); + update_curr(cfs_rq_of(se)); + BUG_ON(!pse); + if (wakeup_preempt_entity(se, pse) == 1) { + /* + * Bias pick_next to pick the sched entity that is + * triggering this preemption. + */ + if (!next_buddy_marked) + set_next_buddy(pse); + goto preempt; + } + + return; + +preempt: + resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); +} + +static struct task_struct *pick_next_task_fair(struct rq *rq) +{ + struct task_struct *p; + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + + // in case nr_running!=0 but h_nr_running==0 + if (!cfs_rq->nr_running || !cfs_rq->h_nr_running) + return NULL; + + do { + se = pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); + + return p; +} + +/* + * Account for a descheduled task: + */ +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +{ + struct sched_entity *se = &prev->se; + struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + put_prev_entity(cfs_rq, se); + } +} + +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq->skip_clock_update = 1; + } + + set_skip_buddy(se); +} + +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ + struct sched_entity *se = &p->se; + + /* throttled hierarchies are not runnable */ + if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + return false; + + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); + + yield_task_fair(rq); + + return true; +} + +#ifdef CONFIG_SMP +/************************************************** + * Fair scheduling class load-balancing methods. + * + * BASICS + * + * The purpose of load-balancing is to achieve the same basic fairness the + * per-cpu scheduler provides, namely provide a proportional amount of compute + * time to each task. This is expressed in the following equation: + * + * W_i,n/P_i == W_j,n/P_j for all i,j (1) + * + * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * W_i,0 is defined as: + * + * W_i,0 = \Sum_j w_i,j (2) + * + * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * is derived from the nice value as per prio_to_weight[]. + * + * The weight average is an exponential decay average of the instantaneous + * weight: + * + * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) + * + * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * fraction of 'recent' time available for SCHED_OTHER task execution. But it + * can also include other factors [XXX]. + * + * To achieve this balance we define a measure of imbalance which follows + * directly from (1): + * + * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) + * + * We them move tasks around to minimize the imbalance. In the continuous + * function space it is obvious this converges, in the discrete case we get + * a few fun cases generally called infeasible weight scenarios. + * + * [XXX expand on: + * - infeasible weights; + * - local vs global optima in the discrete case. ] + * + * + * SCHED DOMAINS + * + * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) + * for all i,j solution, we create a tree of cpus that follows the hardware + * topology where each level pairs two lower groups (or better). This results + * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * tree to only the first of the previous level and we decrease the frequency + * of load-balance at each level inv. proportional to the number of cpus in + * the groups. + * + * This yields: + * + * log_2 n 1 n + * \Sum { --- * --- * 2^i } = O(n) (5) + * i = 0 2^i 2^i + * `- size of each group + * | | `- number of cpus doing load-balance + * | `- freq + * `- sum over all levels + * + * Coupled with a limit on how many tasks we can migrate every balance pass, + * this makes (5) the runtime complexity of the balancer. + * + * An important property here is that each CPU is still (indirectly) connected + * to every other cpu in at most O(log n) steps: + * + * The adjacency matrix of the resulting graph is given by: + * + * log_2 n + * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) + * k = 0 + * + * And you'll find that: + * + * A^(log_2 n)_i,j != 0 for all i,j (7) + * + * Showing there's indeed a path between every cpu in at most O(log n) steps. + * The task movement gives a factor of O(m), giving a convergence complexity + * of: + * + * O(nm log n), n := nr_cpus, m := nr_tasks (8) + * + * + * WORK CONSERVING + * + * In order to avoid CPUs going idle while there's still work to do, new idle + * balancing is more aggressive and has the newly idle cpu iterate up the domain + * tree itself instead of relying on other CPUs to bring it work. + * + * This adds some complexity to both (5) and (8) but it reduces the total idle + * time. + * + * [XXX more?] + * + * + * CGROUPS + * + * Cgroups make a horror show out of (2), instead of a simple sum we get: + * + * s_k,i + * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) + * S_k + * + * Where + * + * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) + * + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * + * The big problem is S_k, its a global sum needed to compute a local (W_i) + * property. + * + * [XXX write more on how we solve this.. _after_ merging pjt's patches that + * rewrite all of this once again.] + */ + +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 +#define LBF_SOME_PINNED 0x04 + +struct lb_env { + struct sched_domain *sd; + + struct rq *src_rq; + int src_cpu; + + int dst_cpu; + struct rq *dst_rq; + + struct cpumask *dst_grpmask; + int new_dst_cpu; + enum cpu_idle_type idle; + long imbalance; + /* The set of CPUs under consideration for load-balancing */ + struct cpumask *cpus; + + unsigned int flags; + + unsigned int loop; + unsigned int loop_break; + unsigned int loop_max; +#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT + int mt_check_cache_in_idle; +#endif +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + unsigned int fail_reason; +#endif +}; + +/* + * move_task - move a task from one runqueue to another runqueue. + * Both runqueues must be locked. + */ +static void move_task(struct task_struct *p, struct lb_env *env) +{ + deactivate_task(env->src_rq, p, 0); + set_task_cpu(p, env->dst_cpu); + activate_task(env->dst_rq, p, 0); + check_preempt_curr(env->dst_rq, p, 0); + +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + if(PA_MON_ENABLE) { + if(strcmp(p->comm, PA_MON) == 0) { + printk(KERN_EMERG "[PA] %s Balance From CPU%d to CPU%d\n", p->comm, env->src_rq->cpu, env->dst_rq->cpu); + } + } +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + +} + +/* + * Is this task likely cache-hot: + */ +#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) +static int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd, int mt_check_cache_in_idle) +#else +static int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +#endif +{ + s64 delta; + + if (p->sched_class != &fair_sched_class) + return 0; + + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + + /* + * Buddy candidates are cache hot: + */ +#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT + if (!mt_check_cache_in_idle){ + if ( !this_rq()->nr_running && (task_rq(p)->nr_running >= 2) ) + return 0; + } +#endif + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) + return 1; + + if (sysctl_sched_migration_cost == -1) + return 1; + if (sysctl_sched_migration_cost == 0) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static +int can_migrate_task(struct task_struct *p, struct lb_env *env) +{ + int tsk_cache_hot = 0; + /* + * We do not migrate tasks that are: + * 1) throttled_lb_pair, or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. + */ + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + int cpu; + + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_AFFINITY); + if(mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){ + char strings[128]=""; + snprintf(strings, 128, "%d:balance fail:affinity:%d:%d:%s:0x%lu" + , env->dst_cpu, env->src_cpu, p->pid, p->comm, p->cpus_allowed.bits[0]); + trace_sched_lbprof_log(strings); + } +#endif + + /* + * Remember if this task can be migrated to any other cpu in + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * + * Also avoid computing new_dst_cpu if we have already computed + * one in current iteration. + */ + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + return 0; + + /* Prevent to re-select dst_cpu via env's cpus */ + for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = cpu; + break; + } + } + + return 0; + } + + /* Record that we found atleast one task that could run on dst_cpu */ + env->flags &= ~LBF_ALL_PINNED; + + if (task_running(env->src_rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_RUNNING); + if( mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){ + char strings[128]=""; + snprintf(strings, 128, "%d:balance fail:running:%d:%d:%s" + , env->dst_cpu, env->src_cpu, p->pid, p->comm); + trace_sched_lbprof_log(strings); + } +#endif + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ +#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd, env->mt_check_cache_in_idle); +#else + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); +#endif + if (!tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } + + return 1; + } + + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_CACHEHOT); + if(mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){ + char strings[128]=""; + snprintf(strings, 128, "%d:balance fail:cache hot:%d:%d:%s" + , env->dst_cpu, env->src_cpu, p->pid, p->comm); + trace_sched_lbprof_log(strings); + } +#endif + return 0; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_one_task(struct lb_env *env) +{ + struct task_struct *p, *n; +#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT + env->mt_check_cache_in_idle = 1; +#endif +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + mt_lbprof_stat_set(env->fail_reason, MT_LBPROF_NO_TRIGGER); +#endif + + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { +#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE) + if(need_lazy_balance(env->dst_cpu, env->src_cpu, p)) + continue; +#endif + if (!can_migrate_task(p, env)) + continue; + + move_task(p, env); + /* + * Right now, this is only the second place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; + } + return 0; +} + +static unsigned long task_h_load(struct task_struct *p); + +static const unsigned int sched_nr_migrate_break = 32; + +/* in second round load balance, we migrate heavy load_weight task + as long as RT tasks exist in busy cpu*/ +#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT + #define over_imbalance(lw, im) \ + (((lw)/2 > (im)) && \ + ((env->mt_check_cache_in_idle==1) || \ + (env->src_rq->rt.rt_nr_running==0) || \ + (pulled>0))) +#else + #define over_imbalance(lw, im) (((lw) / 2) > (im)) +#endif + +/* + * move_tasks tries to move up to imbalance weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct lb_env *env) +{ + struct list_head *tasks = &env->src_rq->cfs_tasks; + struct task_struct *p; + unsigned long load; + int pulled = 0; + + if (env->imbalance <= 0) + return 0; + + mt_sched_printf(sched_log, "move_tasks start "); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + + env->loop++; + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) + break; + + /* take a breather every nr_migrate tasks */ + if (env->loop > env->loop_break) { + env->loop_break += sched_nr_migrate_break; + env->flags |= LBF_NEED_BREAK; + break; + } +#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE) + if(need_lazy_balance(env->dst_cpu, env->src_cpu, p)) + goto next; +#endif + if (!can_migrate_task(p, env)) + goto next; + + load = task_h_load(p); + + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) + goto next; + + if (over_imbalance(load, env->imbalance)) + { + goto next; + } + + move_task(p, env); + pulled++; + env->imbalance -= load; + +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. + */ + if (env->idle == CPU_NEWLY_IDLE) + break; +#endif + + /* + * We only want to steal up to the prescribed amount of + * weighted load. + */ + if (env->imbalance <= 0) + break; + + continue; +next: + list_move_tail(&p->se.group_node, tasks); + } + + /* + * Right now, this is one of only two places move_task() is called, + * so we can safely collect move_task() stats here rather than + * inside move_task(). + */ + schedstat_add(env->sd, lb_gained[env->idle], pulled); + + mt_sched_printf(sched_log, "move_tasks end"); + + return pulled; +} + +#ifdef CONFIG_MTK_SCHED_CMP +#ifdef CONFIG_MTK_SCHED_CMP_TGS +static int cmp_can_migrate_task(struct task_struct *p, struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + BUG_ON(sd == NULL); + + if (!(sd->flags & SD_BALANCE_TG)) + return 0; + + if (arch_is_multi_cluster()) { + int src_clid, dst_clid; + int src_nr_cpus; + struct thread_group_info_t *src_tginfo, *dst_tginfo; + + src_clid = get_cluster_id(env->src_cpu); + dst_clid = get_cluster_id(env->dst_cpu); + BUG_ON(dst_clid == -1 || src_clid == -1); + BUG_ON(p == NULL || p->group_leader == NULL); + src_tginfo = &p->group_leader->thread_group_info[src_clid]; + dst_tginfo = &p->group_leader->thread_group_info[dst_clid]; + src_nr_cpus = nr_cpus_in_cluster(src_clid, false); + + mt_sched_printf(sched_cmp_info, "check rule0: pid=%d comm=%s load=%ld src:clid=%d tginfo->nr_running=%ld nr_cpus=%d load_avg_ratio=%ld", + p->pid, p->comm, p->se.avg.load_avg_ratio, + src_clid, src_tginfo->nr_running, src_nr_cpus, + src_tginfo->load_avg_ratio); +#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP + if ( (!thread_group_empty(p)) && + (src_tginfo->nr_running <= src_nr_cpus) && + (src_tginfo->nr_running > dst_tginfo->nr_running)){ + mt_sched_printf(sched_log, "hit ruleA: bypass pid=%d comm=%s src:nr_running=%lu nr_cpus=%d dst:nr_running=%lu", + p->pid, p->comm, src_tginfo->nr_running, src_nr_cpus, dst_tginfo->nr_running); + return 0; + } +#endif + } + return 1; +} + +static int need_migrate_task_immediately(struct task_struct *p, + struct lb_env *env, struct clb_env *clbenv) +{ + struct sched_domain *sd = env->sd; + + BUG_ON(sd == NULL); + + if (arch_is_big_little()) { + mt_sched_printf(sched_cmp, "[%s] b.L arch", __func__); + mt_sched_printf(sched_cmp_info, "check rule0: pid=%d comm=%s src=%d dst=%d p->prio=%d p->se.avg.load_avg_ratio=%ld", + p->pid, p->comm, env->src_cpu, env->dst_cpu, p->prio, p->se.avg.load_avg_ratio); + /* from LITTLE to big */ + if (arch_cpu_is_little(env->src_cpu) && arch_cpu_is_big(env->dst_cpu)) { + BUG_ON(env->src_cpu != clbenv->ltarget); + if (p->se.avg.load_avg_ratio >= clbenv->bstats.threshold) + return 1; + + /* from big to LITTLE */ + } else if (arch_cpu_is_big(env->src_cpu) && arch_cpu_is_little(env->dst_cpu)) { + BUG_ON(env->src_cpu != clbenv->btarget); + if (p->se.avg.load_avg_ratio < clbenv->lstats.threshold) + return 1; + } + return 0; + } + + if (arch_is_multi_cluster() && (sd->flags & SD_BALANCE_TG)) { + int src_clid, dst_clid; + int src_nr_cpus; + struct thread_group_info_t *src_tginfo, *dst_tginfo; + + src_clid = get_cluster_id(env->src_cpu); + dst_clid = get_cluster_id(env->dst_cpu); + BUG_ON(dst_clid == -1 || src_clid == -1); + BUG_ON(p == NULL || p->group_leader == NULL); + src_tginfo = &p->group_leader->thread_group_info[src_clid]; + dst_tginfo = &p->group_leader->thread_group_info[dst_clid]; + src_nr_cpus = nr_cpus_in_cluster(src_clid, false); + mt_sched_printf(sched_cmp, "[%s] L.L arch", __func__); + + if ((p->se.avg.load_avg_ratio*4 >= NICE_0_LOAD*3) && + src_tginfo->nr_running > src_nr_cpus && + src_tginfo->load_avg_ratio*10 > NICE_0_LOAD*src_nr_cpus*9) { + //pr_warn("[%s] hit rule0, candidate_load_move/load_move (%ld/%ld)\n", + // __func__, candidate_load_move, env->imbalance); + return 1; + } + } + + return 0; +} +#endif + +/* + * move_tasks tries to move up to load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int cmp_move_tasks(struct sched_domain *sd, struct lb_env *env) +{ + struct list_head *tasks = &env->src_rq->cfs_tasks; + struct task_struct *p; + unsigned long load = 0; + int pulled = 0; + + long tg_load_move, other_load_move; + struct list_head tg_tasks, other_tasks; + int src_clid, dst_clid; +#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP + struct cpumask tmp, *cpus = &tmp; +#endif +#ifdef MTK_QUICK + int flag = 0; +#endif + struct clb_env clbenv; + struct cpumask srcmask, dstmask; + + if (env->imbalance <= 0) + return 0; + + other_load_move = env->imbalance; + INIT_LIST_HEAD(&other_tasks); + +// if (sd->flags & SD_BALANCE_TG) { + tg_load_move = env->imbalance; + INIT_LIST_HEAD(&tg_tasks); + src_clid = get_cluster_id(env->src_cpu); + dst_clid = get_cluster_id(env->dst_cpu); + BUG_ON(dst_clid == -1 || src_clid == -1); + +#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP + get_cluster_cpus(cpus, src_clid, true); +#endif + mt_sched_printf(sched_cmp, "move_tasks_tg start: src:cpu=%d clid=%d runnable_load=%lu dst:cpu=%d clid=%d runnable_load=%lu imbalance=%ld curr->on_rq=%d", + env->src_cpu, src_clid, cpu_rq(env->src_cpu)->cfs.runnable_load_avg, + env->dst_cpu, dst_clid, cpu_rq(env->dst_cpu)->cfs.runnable_load_avg, + env->imbalance, env->dst_rq->curr->on_rq); +// } + + mt_sched_printf(sched_cmp, "max=%d busiest->nr_running=%d", + env->loop_max, cpu_rq(env->src_cpu)->nr_running); + + if (arch_is_big_little()) { + get_cluster_cpus(&srcmask, src_clid, true); + get_cluster_cpus(&dstmask, dst_clid, true); + memset(&clbenv, 0, sizeof(clbenv)); + clbenv.flags |= HMP_LB; + clbenv.ltarget = arch_cpu_is_little(env->src_cpu) ? env->src_cpu : env->dst_cpu; + clbenv.btarget = arch_cpu_is_big(env->src_cpu) ? env->src_cpu : env->dst_cpu; + clbenv.lcpus = arch_cpu_is_little(env->src_cpu) ? &srcmask : &dstmask; + clbenv.bcpus = arch_cpu_is_big(env->src_cpu) ? &srcmask : &dstmask; + sched_update_clbstats(&clbenv); + } + + while (!list_empty(tasks)) { + struct thread_group_info_t *src_tginfo, *dst_tginfo; + + p = list_first_entry(tasks, struct task_struct, se.group_node); + + mt_sched_printf(sched_cmp_info, "check: pid=%d comm=%s load_avg_contrib=%lu h_load=%lu runnable_load_avg=%lu loop=%d, env->imbalance=%ld tg_load_move=%ld", + p->pid, p->comm, p->se.avg.load_avg_contrib, + task_cfs_rq(p)->h_load, task_cfs_rq(p)->runnable_load_avg, + env->loop, env->imbalance, tg_load_move); + env->loop++; + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) + break; + +#if 0 // TO check + /* take a breather every nr_migrate tasks */ + if (env->loop > env->loop_break) { + env->loop_break += sched_nr_migrate_break; + env->flags |= LBF_NEED_BREAK; + break; + } +#endif + BUG_ON(p == NULL || p->group_leader == NULL); + src_tginfo = &p->group_leader->thread_group_info[src_clid]; + dst_tginfo = &p->group_leader->thread_group_info[dst_clid]; + + /* rule0 */ + if (!can_migrate_task(p, env)) { + mt_sched_printf(sched_cmp, "can not migrate: pid=%d comm=%s", + p->pid, p->comm); + goto next; + } + + load = task_h_load(p); + + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) { + mt_sched_printf(sched_cmp, "can not migrate: pid=%d comm=%s sched_feat", + p->pid, p->comm ); + goto next; + } + + if (over_imbalance(load, env->imbalance)) { + mt_sched_printf(sched_cmp, "can not migrate: pid=%d comm=%s load=%ld imbalance=%ld", + p->pid, p->comm, load, env->imbalance ); + goto next; + } + + /* meet rule0 , migrate immediately */ + if (need_migrate_task_immediately(p, env, &clbenv)) { + pulled++; + env->imbalance -= load; + tg_load_move -= load; + other_load_move -= load; + mt_sched_printf(sched_cmp, "hit rule0: pid=%d comm=%s load=%ld imbalance=%ld tg_imbalance=%ld other_load_move=%ld", + p->pid, p->comm, load, env->imbalance, tg_load_move, other_load_move); + move_task(p, env); + if (env->imbalance <= 0) + break; + continue; + } + + /* for TGS */ + if (!cmp_can_migrate_task(p, env)) + goto next; + + if (sd->flags & SD_BALANCE_TG){ + if (over_imbalance(load, tg_load_move)) { + mt_sched_printf(sched_cmp, "can not migrate: pid=%d comm=%s load=%ld imbalance=%ld", + p->pid, p->comm, load, tg_load_move ); + goto next; + } + +#ifdef MTK_QUICK + if (candidate_load_move <= 0) { + mt_sched_printf(sched_cmp, "check: pid=%d comm=%s candidate_load_move=%d", + p->pid, p->comm, candidate_load_move); + goto next; + } +#endif + + /* rule1, single thread */ + mt_sched_printf(sched_cmp_info, "check rule1: pid=%d p->comm=%s thread_group_cnt=%lu thread_group_empty(p)=%d", + p->pid, p->comm, + p->group_leader->thread_group_info[0].nr_running + + p->group_leader->thread_group_info[1].nr_running, + thread_group_empty(p)); + + if (thread_group_empty(p)) { + list_move_tail(&p->se.group_node, &tg_tasks); + tg_load_move -= load; + other_load_move -= load; + mt_sched_printf(sched_cmp, "hit rule1: pid=%d p->comm=%s load=%ld tg_imbalance=%ld", + p->pid, p->comm, load, tg_load_move); + continue; + } + + /* rule2 */ + mt_sched_printf(sched_cmp_info, "check rule2: pid=%d p->comm=%s %ld, %ld, %ld, %ld, %ld", + p->pid, p->comm, src_tginfo->nr_running, src_tginfo->cfs_nr_running, dst_tginfo->nr_running, + p->se.avg.load_avg_ratio, src_tginfo->load_avg_ratio); + if ((src_tginfo->nr_running < dst_tginfo->nr_running) && + ((p->se.avg.load_avg_ratio * src_tginfo->cfs_nr_running) <= + src_tginfo->load_avg_ratio)) { + list_move_tail(&p->se.group_node, &tg_tasks); + tg_load_move -= load; + other_load_move -= load; + mt_sched_printf(sched_cmp, "hit rule2: pid=%d p->comm=%s load=%ld tg_imbalance=%ld", + p->pid, p->comm, load, tg_load_move); + continue; + } + + if (over_imbalance(load, other_load_move)) + goto next; +/* + if (other_load_move <= 0) + goto next; +*/ + + list_move_tail(&p->se.group_node, &other_tasks); + other_load_move -= load; + continue; + }else{ + list_move_tail(&p->se.group_node, &other_tasks); + other_load_move -= load; + continue; + } + + // ytchang +#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE) + if(need_lazy_balance(env->dst_cpu, env->src_cpu, p)) + goto next; +#endif + +next: + /* original rule */ + list_move_tail(&p->se.group_node, tasks); + } // end of while() + + if ( sd->flags & SD_BALANCE_TG){ + while (!list_empty(&tg_tasks)) { + p = list_first_entry(&tg_tasks, struct task_struct, se.group_node); + list_move_tail(&p->se.group_node, tasks); + + if (env->imbalance > 0) { + load = task_h_load(p); + if (over_imbalance(load, env->imbalance)){ + mt_sched_printf(sched_cmp, "overload rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld", + p->pid, p->comm, load, env->imbalance); +#ifdef MTK_QUICK + + flag=1; +#endif + continue; + } + + move_task(p, env); + env->imbalance -= load; + pulled++; + + mt_sched_printf(sched_cmp, "migrate hit rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld", + p->pid, p->comm, load, env->imbalance); + } + } + } + + mt_sched_printf(sched_cmp, "move_tasks_tg finish rule migrate"); + + while (!list_empty(&other_tasks)) { + p = list_first_entry(&other_tasks, struct task_struct, se.group_node); + list_move_tail(&p->se.group_node, tasks); + +#ifdef MTK_QUICK + if (!flag && (env->imbalance > 0)) { +#else + if (env->imbalance > 0) { +#endif + load = task_h_load(p); + + if (over_imbalance(load, env->imbalance)){ + mt_sched_printf(sched_cmp, "overload others: pid=%d p->comm=%s load=%ld imbalance=%ld", + p->pid, p->comm, load, env->imbalance); + continue; + } + + move_task(p, env); + env->imbalance -= load; + pulled++; + + mt_sched_printf(sched_cmp, "migrate others: pid=%d p->comm=%s load=%ld imbalance=%ld", + p->pid, p->comm, load, env->imbalance); + } + } + + /* + * Right now, this is one of only two places move_task() is called, + * so we can safely collect move_task() stats here rather than + * inside move_task(). + */ + schedstat_add(env->sd, lb_gained[env->idle], pulled); + + mt_sched_printf(sched_cmp, "move_tasks_tg finish pulled=%d imbalance=%ld", pulled, env->imbalance); + + return pulled; +} + +#endif /* CONFIG_MTK_SCHED_CMP */ + + +#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE) +static int need_lazy_balance(int dst_cpu, int src_cpu, struct task_struct *p) +{ + /* Lazy balnace for small task + 1. src cpu is buddy cpu + 2. src cpu is not busy cpu + 3. p is light task + */ +#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER + if ( PA_ENABLE && cpumask_test_cpu(src_cpu, &buddy_cpu_map) && + !is_buddy_busy(src_cpu) && is_light_task(p)) { +#else + if (cpumask_test_cpu(src_cpu, &buddy_cpu_map) && + !is_buddy_busy(src_cpu) && is_light_task(p)) { +#endif +#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER + unsigned int i; + AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[src_cpu][dst_cpu]++; + mt_sched_printf(sched_pa, "[PA]pid=%d, Lazy balance from CPU%d to CPU%d\n)\n", p->pid, src_cpu, dst_cpu); + for(i=0;i<4;i++) { + if(PA_MON_ENABLE && (strcmp(p->comm, &PA_MON[i][0]) == 0)) { + printk(KERN_EMERG "[PA] %s Lazy balance from CPU%d to CPU%d\n", p->comm, src_cpu, dst_cpu); + // printk(KERN_EMERG "[PA] src_cpu RQ Usage = %u, Period = %u, NR = %u\n", + // per_cpu(BUDDY_CPU_RQ_USAGE, src_cpu), + // per_cpu(BUDDY_CPU_RQ_PERIOD, src_cpu), + // per_cpu(BUDDY_CPU_RQ_NR, src_cpu)); + // printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n", + // p->se.avg.usage_avg_sum, + // p->se.avg.runnable_avg_period); + } + } +#endif + return 1; + } + else + return 0; +} +#endif +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * update tg->load_weight by folding this cpu's load_avg + */ +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) +{ + struct sched_entity *se = tg->se[cpu]; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; + + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + return; + + update_cfs_rq_blocked_load(cfs_rq, 1); + + if (se) { + update_entity_load_avg(se, 1); + /* + * We pivot on our runnable average having decayed to zero for + * list removal. This generally implies that all our children + * have also been removed (modulo rounding error or bandwidth + * control); however, such cases are rare and we can fix these + * at enqueue. + * + * TODO: fix up out-of-order children on enqueue. + */ + if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) + list_del_leaf_cfs_rq(cfs_rq); + } else { + struct rq *rq = rq_of(cfs_rq); + update_rq_runnable_avg(rq, rq->nr_running); + } +} + +static void update_blocked_averages(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct cfs_rq *cfs_rq; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + /* + * Iterates the task_group tree in a bottom up fashion, see + * list_add_leaf_cfs_rq() for details. + */ + for_each_leaf_cfs_rq(rq, cfs_rq) { + /* + * Note: We may want to consider periodically releasing + * rq->lock about these updates so that creating many task + * groups does not result in continually extending hold time. + */ + __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); + } + + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. + */ +static int tg_load_down(struct task_group *tg, void *data) +{ + unsigned long load; + long cpu = (long)data; + + if (!tg->parent) { + /* + * rq's sched_avg is not updated accordingly. adopt rq's + * corresponding cfs_rq runnable loading instead. + * + * a003a25b sched: Consider runnable load average... + * + + load = cpu_rq(cpu)->avg.load_avg_contrib; + + */ + load = cpu_rq(cpu)->cfs.runnable_load_avg; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, + tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); + } + + tg->cfs_rq[cpu]->h_load = load; + + return 0; +} + +static void update_h_load(long cpu) +{ + rcu_read_lock(); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); + rcu_read_unlock(); +} + +static unsigned long task_h_load(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, + cfs_rq->runnable_load_avg + 1); +} +#else +static inline void update_blocked_averages(int cpu) +{ +} + +static inline void update_h_load(long cpu) +{ +} + +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.avg.load_avg_contrib; +} +#endif + +/********** Helpers for find_busiest_group ************************/ +/* + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *this; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + /** Statistics of this group */ + unsigned long this_load; + unsigned long this_load_per_task; + unsigned long this_nr_running; + unsigned long this_has_capacity; + unsigned int this_idle_cpus; + + /* Statistics of the busiest group */ + unsigned int busiest_idle_cpus; + unsigned long max_load; + unsigned long busiest_load_per_task; + unsigned long busiest_nr_running; + unsigned long busiest_group_capacity; + unsigned long busiest_has_capacity; + unsigned int busiest_group_weight; + + int group_imb; /* Is there imbalance in this sd */ +}; + +/* + * sg_lb_stats - stats of a sched_group required for load_balancing + */ +struct sg_lb_stats { + unsigned long avg_load; /*Avg load across the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long sum_nr_running; /* Nr tasks running in the group */ + unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + unsigned long group_capacity; + unsigned long idle_cpus; + unsigned long group_weight; + int group_imb; /* Is there an imbalance in the group ? */ + int group_has_capacity; /* Is there extra capacity in the group? */ +}; + +/** + * get_sd_load_idx - Obtain the load index for a given sched domain. + * @sd: The sched_domain whose load_idx is to be obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + */ +static inline int get_sd_load_idx(struct sched_domain *sd, + enum cpu_idle_type idle) +{ + int load_idx; + + switch (idle) { + case CPU_NOT_IDLE: + load_idx = sd->busy_idx; + break; + + case CPU_NEWLY_IDLE: + load_idx = sd->newidle_idx; + break; + default: + load_idx = sd->idle_idx; + break; + } + + return load_idx; +} + +static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_POWER_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return default_scale_freq_power(sd, cpu); +} + +static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = sd->span_weight; + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} + +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + return default_scale_smt_power(sd, cpu); +} + +static unsigned long scale_rt_power(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 total, available, age_stamp, avg; + + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = ACCESS_ONCE(rq->age_stamp); + avg = ACCESS_ONCE(rq->rt_avg); + + total = sched_avg_period() + (rq->clock - age_stamp); + + if (unlikely(total < avg)) { + /* Ensures that power won't end up being negative */ + available = 0; + } else { + available = total - avg; + } + + if (unlikely((s64)total < SCHED_POWER_SCALE)) + total = SCHED_POWER_SCALE; + + total >>= SCHED_POWER_SHIFT; + + return div_u64(available, total); +} + +static void update_cpu_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = sd->span_weight; + unsigned long power = SCHED_POWER_SCALE; + struct sched_group *sdg = sd->groups; + + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + if (sched_feat(ARCH_POWER)) + power *= arch_scale_smt_power(sd, cpu); + else + power *= default_scale_smt_power(sd, cpu); + + power >>= SCHED_POWER_SHIFT; + } + + sdg->sgp->power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_POWER_SHIFT; + + power *= scale_rt_power(cpu); + power >>= SCHED_POWER_SHIFT; + + if (!power) + power = 1; + + cpu_rq(cpu)->cpu_power = power; + sdg->sgp->power = power; +} + +void update_group_power(struct sched_domain *sd, int cpu) +{ + struct sched_domain *child = sd->child; + struct sched_group *group, *sdg = sd->groups; + unsigned long power; + unsigned long interval; + + interval = msecs_to_jiffies(sd->balance_interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + sdg->sgp->next_update = jiffies + interval; + + if (!child) { + update_cpu_power(sd, cpu); + return; + } + + power = 0; + + if (child->flags & SD_OVERLAP) { + /* + * SD_OVERLAP domains cannot assume that child groups + * span the current group. + */ + + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); + } else { + /* + * !SD_OVERLAP domains can assume that child groups + * span the current group. + */ + + group = child->groups; + do { + power += group->sgp->power; + group = group->next; + } while (group != child->groups); + } + + sdg->sgp->power_orig = sdg->sgp->power = power; +} + +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_POWER_SCALE + */ + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->sgp->power * 32 > group->sgp->power_orig * 29) + return 1; + + return 0; +} + +/** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. + * @group: sched_group whose statistics are to be updated. + * @load_idx: Load index of sched_domain of this_cpu for load calc. + * @local_group: Does group contain this_cpu. + * @balance: Should we balance. + * @sgs: variable to hold the statistics for this group. + */ +static inline void update_sg_lb_stats(struct lb_env *env, + struct sched_group *group, int load_idx, + int local_group, int *balance, struct sg_lb_stats *sgs) +{ + unsigned long nr_running, max_nr_running, min_nr_running; + unsigned long load, max_cpu_load, min_cpu_load; + unsigned int balance_cpu = -1, first_idle_cpu = 0; + unsigned long avg_load_per_task = 0; + int i; + + if (local_group) + balance_cpu = group_balance_cpu(group); + + /* Tally up the load of all CPUs in the group */ + max_cpu_load = 0; + min_cpu_load = ~0UL; + max_nr_running = 0; + min_nr_running = ~0UL; + + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + struct rq *rq = cpu_rq(i); + + nr_running = rq->nr_running; + + /* Bias balancing toward cpus of our domain */ + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu && + cpumask_test_cpu(i, sched_group_mask(group))) { + first_idle_cpu = 1; + balance_cpu = i; + } + + load = target_load(i, load_idx); + } else { + load = source_load(i, load_idx); + if (load > max_cpu_load) + max_cpu_load = load; + if (min_cpu_load > load) + min_cpu_load = load; + + if (nr_running > max_nr_running) + max_nr_running = nr_running; + if (min_nr_running > nr_running) + min_nr_running = nr_running; + +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + if((load_idx > 0) && (load == cpu_rq(i)->cpu_load[load_idx-1])) + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_HISTORY); +#endif + } + + sgs->group_load += load; + sgs->sum_nr_running += nr_running; + sgs->sum_weighted_load += weighted_cpuload(i); + if (idle_cpu(i)) + sgs->idle_cpus++; + } + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (local_group) { + if (env->idle != CPU_NEWLY_IDLE) { + if (balance_cpu != env->dst_cpu) { + *balance = 0; + return; + } + update_group_power(env->sd, env->dst_cpu); + } else if (time_after_eq(jiffies, group->sgp->next_update)) + update_group_power(env->sd, env->dst_cpu); + } + + /* Adjust by relative CPU power of the group */ + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of a task. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + if (sgs->sum_nr_running) + avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && + (max_nr_running - min_nr_running) > 1) + sgs->group_imb = 1; + + sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, + SCHED_POWER_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(env->sd, group); + sgs->group_weight = group->group_weight; + + if (sgs->group_capacity > sgs->sum_nr_running) + sgs->group_has_capacity = 1; +} + +/** + * update_sd_pick_busiest - return 1 on busiest group + * @env: The load balancing environment. + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sgs: sched_group statistics + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct lb_env *env, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs) +{ + if (sgs->avg_load <= sds->max_load) { + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_PICK_BUSIEST_FAIL_1); + return false; + } + + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + env->dst_cpu < group_first_cpu(sg)) { + if (!sds->busiest) + return true; + + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_PICK_BUSIEST_FAIL_2); + return false; +} + +/** + * update_sd_lb_stats - Update sched_domain's statistics for load balancing. + * @env: The load balancing environment. + * @balance: Should we balance. + * @sds: variable to hold the statistics for this sched_domain. + */ +static inline void update_sd_lb_stats(struct lb_env *env, + int *balance, struct sd_lb_stats *sds) +{ + struct sched_domain *child = env->sd->child; + struct sched_group *sg = env->sd->groups; + struct sg_lb_stats sgs; + int load_idx, prefer_sibling = 0; + + if (child && child->flags & SD_PREFER_SIBLING) + prefer_sibling = 1; + + load_idx = get_sd_load_idx(env->sd, env->idle); + + do { + int local_group; + + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); + memset(&sgs, 0, sizeof(sgs)); + update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); + + if (local_group && !(*balance)) + return; + + sds->total_load += sgs.group_load; + sds->total_pwr += sg->sgp->power; + + /* + * In case the child domain prefers tasks go to siblings + * first, lower the sg capacity to one so that we'll try + * and move all the excess tasks away. We lower the capacity + * of a group only if the local group has the capacity to fit + * these excess tasks, i.e. nr_running < group_capacity. The + * extra check prevents the case where you always pull from the + * heaviest group when it is already under-utilized (possible + * with a large weight task outweighs the tasks on the system). + */ + if (prefer_sibling && !local_group && sds->this_has_capacity) + sgs.group_capacity = min(sgs.group_capacity, 1UL); + + if (local_group) { + sds->this_load = sgs.avg_load; + sds->this = sg; + sds->this_nr_running = sgs.sum_nr_running; + sds->this_load_per_task = sgs.sum_weighted_load; + sds->this_has_capacity = sgs.group_has_capacity; + sds->this_idle_cpus = sgs.idle_cpus; + } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { + sds->max_load = sgs.avg_load; + sds->busiest = sg; + sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_idle_cpus = sgs.idle_cpus; + sds->busiest_group_capacity = sgs.group_capacity; + sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->busiest_has_capacity = sgs.group_has_capacity; + sds->busiest_group_weight = sgs.group_weight; + sds->group_imb = sgs.group_imb; + } + + sg = sg->next; + } while (sg != env->sd->groups); +} + +/** + * check_asym_packing - Check to see if the group is packed into the + * sched doman. + * + * This is primarily intended to used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. + * + * This packing function is run on idle threads. It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on. Here we are + * assuming lower CPU number will be equivalent to lower a SMT thread + * number. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU. The amount of the imbalance is returned in *imbalance. + * + * @env: The load balancing environment. + * @sds: Statistics of the sched_domain which is to be packed + */ +static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) +{ + int busiest_cpu; + + if (!(env->sd->flags & SD_ASYM_PACKING)) + return 0; + + if (!sds->busiest) + return 0; + + busiest_cpu = group_first_cpu(sds->busiest); + if (env->dst_cpu > busiest_cpu) + return 0; + + env->imbalance = DIV_ROUND_CLOSEST( + sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); + + return 1; +} + +/** + * fix_small_imbalance - Calculate the minor imbalance that exists + * amongst the groups of a sched_domain, during + * load balancing. + * @env: The load balancing environment. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + */ +static inline +void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) +{ + unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned int imbn = 2; + unsigned long scaled_busy_load_per_task; + + if (sds->this_nr_running) { + sds->this_load_per_task /= sds->this_nr_running; + if (sds->busiest_load_per_task > + sds->this_load_per_task) + imbn = 1; + } else { + sds->this_load_per_task = + cpu_avg_load_per_task(env->dst_cpu); + } + + scaled_busy_load_per_task = sds->busiest_load_per_task + * SCHED_POWER_SCALE; + scaled_busy_load_per_task /= sds->busiest->sgp->power; + + if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= + (scaled_busy_load_per_task * imbn)) { + env->imbalance = sds->busiest_load_per_task; + return; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += sds->busiest->sgp->power * + min(sds->busiest_load_per_task, sds->max_load); + pwr_now += sds->this->sgp->power * + min(sds->this_load_per_task, sds->this_load); + pwr_now /= SCHED_POWER_SCALE; + + /* Amount of load we'd subtract */ + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / + sds->busiest->sgp->power; + if (sds->max_load > tmp) + pwr_move += sds->busiest->sgp->power * + min(sds->busiest_load_per_task, sds->max_load - tmp); + + /* Amount of load we'd add */ + if (sds->max_load * sds->busiest->sgp->power < + sds->busiest_load_per_task * SCHED_POWER_SCALE) + tmp = (sds->max_load * sds->busiest->sgp->power) / + sds->this->sgp->power; + else + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / + sds->this->sgp->power; + pwr_move += sds->this->sgp->power * + min(sds->this_load_per_task, sds->this_load + tmp); + pwr_move /= SCHED_POWER_SCALE; + + /* Move if we gain throughput */ + if (pwr_move > pwr_now) + env->imbalance = sds->busiest_load_per_task; +} + +/** + * calculate_imbalance - Calculate the amount of imbalance present within the + * groups of a given sched_domain during load balance. + * @env: load balance environment + * @sds: statistics of the sched_domain whose imbalance is to be calculated. + */ +static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) +{ + unsigned long max_pull, load_above_capacity = ~0UL; + + sds->busiest_load_per_task /= sds->busiest_nr_running; + if (sds->group_imb) { + sds->busiest_load_per_task = + min(sds->busiest_load_per_task, sds->avg_load); + } + + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) + */ + if (sds->max_load < sds->avg_load) { + env->imbalance = 0; + return fix_small_imbalance(env, sds); + } + + if (!sds->group_imb) { + /* + * Don't want to pull so many tasks that a group would go idle. + */ + load_above_capacity = (sds->busiest_nr_running - + sds->busiest_group_capacity); + + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); + + load_above_capacity /= sds->busiest->sgp->power; + } + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load. At the same time, + * we also don't want to reduce the group load below the group capacity + * (so that we can implement power-savings policies etc). Thus we look + * for the minimum possible imbalance. + * Be careful of negative numbers as they'll appear as very large values + * with unsigned longs. + */ + max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); + + /* How much load to actually move to equalise the imbalance */ + env->imbalance = min(max_pull * sds->busiest->sgp->power, + (sds->avg_load - sds->this_load) * sds->this->sgp->power) + / SCHED_POWER_SCALE; + + /* + * if *imbalance is less than the average load per runnable task + * there is no guarantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (env->imbalance < sds->busiest_load_per_task) + return fix_small_imbalance(env, sds); + +} + +/******* find_busiest_group() helpers end here *********************/ + +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists. + * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @env: The load balancing environment. + * @balance: Pointer to a variable indicating if this_cpu + * is the appropriate cpu to perform load balancing at this_level. + * + * Returns: - the busiest group if imbalance exists. + * - If no imbalance and user has opted for power-savings balance, + * return the least loaded group whose CPUs can be + * put to idle by rebalancing its tasks onto our group. + */ +static struct sched_group * +find_busiest_group(struct lb_env *env, int *balance) +{ + struct sd_lb_stats sds; + + memset(&sds, 0, sizeof(sds)); + + /* + * Compute the various statistics relavent for load balancing at + * this level. + */ + update_sd_lb_stats(env, balance, &sds); + + /* + * this_cpu is not the appropriate cpu to perform load balancing at + * this level. + */ + if (!(*balance)){ + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_BALANCE); + goto ret; + } + + if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && + check_asym_packing(env, &sds)) + return sds.busiest; + + /* There is no busy sibling group to pull tasks from */ + if (!sds.busiest || sds.busiest_nr_running == 0){ + if(!sds.busiest){ + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_BUSIEST_NO_TASK); + }else{ + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_BUSIEST); + } + goto out_balanced; + } + + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + + /* + * If the busiest group is imbalanced the below checks don't + * work because they assumes all things are equal, which typically + * isn't true due to cpus_allowed constraints and the like. + */ + if (sds.group_imb) + goto force_balance; + + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && + !sds.busiest_has_capacity) + goto force_balance; + + /* + * If the local group is more busy than the selected busiest group + * don't try and pull any tasks. + */ + if (sds.this_load >= sds.max_load){ + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_LARGER_THAN); + goto out_balanced; + } + + /* + * Don't pull any tasks if this group is already above the domain + * average load. + */ + if (sds.this_load >= sds.avg_load){ + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_LARGER_THAN); + goto out_balanced; + } + + if (env->idle == CPU_IDLE) { + /* + * This cpu is idle. If the busiest group load doesn't + * have more tasks than the number of available cpu's and + * there is no imbalance between this and busiest group + * wrt to idle cpu's, it is balanced. + */ + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + sds.busiest_nr_running <= sds.busiest_group_weight) + goto out_balanced; + } else { + /* + * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use + * imbalance_pct to be conservative. + */ + if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load){ + mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_CHECK_FAIL); + goto out_balanced; + } + } + +force_balance: + /* Looks like there is an imbalance. Compute it */ + calculate_imbalance(env, &sds); + return sds.busiest; + +out_balanced: +ret: + env->imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static struct rq *find_busiest_queue(struct lb_env *env, + struct sched_group *group) +{ + struct rq *busiest = NULL, *rq; + unsigned long max_load = 0; + int i; + + for_each_cpu(i, sched_group_cpus(group)) { + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, + SCHED_POWER_SCALE); + unsigned long wl; + + if (!capacity) + capacity = fix_small_capacity(env->sd, group); + + if (!cpumask_test_cpu(i, env->cpus)) + continue; + + rq = cpu_rq(i); + wl = weighted_cpuload(i); + + /* + * When comparing with imbalance, use weighted_cpuload() + * which is not scaled with the cpu power. + */ + if (capacity && rq->nr_running == 1 && wl > env->imbalance) + continue; + + /* + * For the load comparisons with the other cpu's, consider + * the weighted_cpuload() scaled with the cpu power, so that + * the load can be moved away from the cpu that is potentially + * running at a lower capacity. + */ + wl = (wl * SCHED_POWER_SCALE) / power; + + if (wl > max_load) { + max_load = wl; + busiest = rq; + } + } + + return busiest; +} + +/* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + +/* Working cpumask for load_balance and load_balance_newidle. */ +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); + +static int need_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + if (env->idle == CPU_NEWLY_IDLE) { + + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * higher numbered CPUs in order to pack all tasks in the + * lowest numbered CPUs. + */ + if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) + return 1; + } + + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); +} + +static int active_load_balance_cpu_stop(void *data); + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + */ +static int load_balance(int this_cpu, struct rq *this_rq, + struct sched_domain *sd, enum cpu_idle_type idle, + int *balance) +{ + int ld_moved, cur_ld_moved, active_balance = 0; + struct sched_group *group; + struct rq *busiest; + unsigned long flags; + struct cpumask *cpus = __get_cpu_var(load_balance_mask); + + struct lb_env env = { + .sd = sd, + .dst_cpu = this_cpu, + .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), + .idle = idle, + .loop_break = sched_nr_migrate_break, + .cpus = cpus, +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + .fail_reason= MT_LBPROF_NO_TRIGGER, +#endif + }; + + /* + * For NEWLY_IDLE load_balancing, we don't need to consider + * other cpus in our group + */ + if (idle == CPU_NEWLY_IDLE) + env.dst_grpmask = NULL; + + cpumask_copy(cpus, cpu_active_mask); + + schedstat_inc(sd, lb_count[idle]); + +redo: + group = find_busiest_group(&env, balance); + + if (*balance == 0) + goto out_balanced; + + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + if(mt_lbprof_test(env.fail_reason, MT_LBPROF_HISTORY)){ + int tmp_cpu; + for_each_cpu(tmp_cpu, cpu_possible_mask){ + if (tmp_cpu == this_rq->cpu) + continue; + mt_lbprof_update_state(tmp_cpu, MT_LBPROF_BALANCE_FAIL_STATE); + } + } + goto out_balanced; + } + + busiest = find_busiest_queue(&env, group); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_NOBUSYQ); + goto out_balanced; + } + +#ifdef CONFIG_HMP_LAZY_BALANCE + +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + if (PA_ENABLE && LB_ENABLE) { +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + + if (per_cpu(sd_pack_buddy, this_cpu) == busiest->cpu && !is_buddy_busy(per_cpu(sd_pack_buddy, this_cpu))) { + +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[this_cpu][busiest->cpu]++; + +#ifdef CONFIG_HMP_TRACER + trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY, 0, this_cpu, busiest->cpu); +#endif /* CONFIG_HMP_TRACER */ + +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + } +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + +#endif /* CONFIG_HMP_LAZY_BALANCE */ + + BUG_ON(busiest == env.dst_rq); + + schedstat_add(sd, lb_imbalance[idle], env.imbalance); + + ld_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. ld_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + env.flags |= LBF_ALL_PINNED; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); +#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT + env.mt_check_cache_in_idle = 1; +#endif + + update_h_load(env.src_cpu); +more_balance: + local_irq_save(flags); + double_rq_lock(env.dst_rq, busiest); +#ifdef CONFIG_MTK_SCHED_CMP + env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); + mt_sched_printf(sched_cmp, "1 busiest->nr_running=%d src=%d, dst=%d, cpus_share_cache=%d loop_max=%d loop=%d imbalance=%ld", + busiest->nr_running, env.src_cpu, env.dst_cpu, cpus_share_cache(env.src_cpu, env.dst_cpu), env.loop_max, env.loop, env.imbalance); +#endif /* CONFIG_MTK_SCHED_CMP */ + /* + * cur_ld_moved - load moved in current iteration + * ld_moved - cumulative load moved across iterations + */ +#ifdef CONFIG_MTK_SCHED_CMP + if (!cpus_share_cache(env.src_cpu, env.dst_cpu)) + cur_ld_moved = cmp_move_tasks(sd, &env); + else + cur_ld_moved = move_tasks(&env); +#else /* !CONFIG_MTK_SCHED_CMP */ + cur_ld_moved = move_tasks(&env); +#endif /* CONFIG_MTK_SCHED_CMP */ + ld_moved += cur_ld_moved; + double_rq_unlock(env.dst_rq, busiest); + local_irq_restore(flags); + + /* + * some other cpu did the load balance for us. + */ + if (cur_ld_moved && env.dst_cpu != smp_processor_id()) + resched_cpu(env.dst_cpu); + + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + + /* + * Revisit (affine) tasks on src_cpu that couldn't be moved to + * us and move them to an alternate dst_cpu in our sched_group + * where they can run. The upper limit on how many times we + * iterate on same src_cpu is dependent on number of cpus in our + * sched_group. + * + * This changes load balance semantics a bit on who can move + * load to a given_cpu. In addition to the given_cpu itself + * (or a ilb_cpu acting on its behalf where given_cpu is + * nohz-idle), we now have balance_cpu in a position to move + * load to given_cpu. In rare situations, this may cause + * conflicts (balance_cpu and given_cpu/ilb_cpu deciding + * _independently_ and at _same_ time to move some load to + * given_cpu) causing exceess load to be moved to given_cpu. + * This however should not happen so much in practice and + * moreover subsequent load balance cycles should correct the + * excess load moved. + */ + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + + env.dst_rq = cpu_rq(env.new_dst_cpu); + env.dst_cpu = env.new_dst_cpu; + env.flags &= ~LBF_SOME_PINNED; + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + + /* + * Go back to "more_balance" rather than "redo" since we + * need to continue with same src_cpu. + */ + goto more_balance; + } + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(env.flags & LBF_ALL_PINNED)) { + mt_lbprof_update_state(busiest->cpu, MT_LBPROF_ALLPINNED); + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + goto redo; + } + goto out_balanced; + } + +#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT + /* when move tasks fil, force migration no matter cache-hot */ + /* use mt_check_cache_in_idle */ + if (!ld_moved && ((CPU_NEWLY_IDLE == idle) || (CPU_IDLE == idle) ) ) { +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + mt_lbprof_stat_set(env.fail_reason, MT_LBPROF_DO_LB); +#endif + env.mt_check_cache_in_idle = 0; + env.loop = 0; + local_irq_save(flags); + double_rq_lock(env.dst_rq, busiest); +#ifdef CONFIG_MTK_SCHED_CMP + env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); + mt_sched_printf(sched_log, "2 env.loop_max=%d, busiest->nr_running=%d", + env.loop_max, busiest->nr_running); +#endif /* CONFIG_MTK_SCHED_CMP */ + if (!env.loop) + update_h_load(env.src_cpu); +#ifdef CONFIG_MTK_SCHED_CMP_TGS + if (!cpus_share_cache(env.src_cpu, env.dst_cpu)) + ld_moved = cmp_move_tasks(sd, &env); + else{ + ld_moved = move_tasks(&env); + } +#else /* !CONFIG_MTK_SCHED_CMP_TGS */ + ld_moved = move_tasks(&env); +#endif /* CONFIG_MTK_SCHED_CMP_TGS */ + double_rq_unlock(env.dst_rq, busiest); + local_irq_restore(flags); + + /* + * some other cpu did the load balance for us. + */ + if (ld_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + } +#endif + } + + if (!ld_moved) { + schedstat_inc(sd, lb_failed[idle]); + mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_FAILED); + if ( mt_lbprof_test(env.fail_reason, MT_LBPROF_AFFINITY) ) { + mt_lbprof_update_state(busiest->cpu, MT_LBPROF_FAILURE_STATE); + }else if ( mt_lbprof_test(env.fail_reason, MT_LBPROF_CACHEHOT) ) { + mt_lbprof_update_state(busiest->cpu, MT_LBPROF_FAILURE_STATE); + } + + /* + * Increment the failure counter only on periodic balance. + * We do not want newidle balance, which can be very + * frequent, pollute the failure counter causing + * excessive cache_hot migrations and active balances. + */ + if (idle != CPU_NEWLY_IDLE) + sd->nr_balance_failed++; + mt_lbprof_stat_inc(sd, mt_lbprof_nr_balance_failed); + + if (need_active_balance(&env)) { + raw_spin_lock_irqsave(&busiest->lock, flags); + + /* don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest cpu can't be + * moved to this_cpu + */ + if (!cpumask_test_cpu(this_cpu, + tsk_cpus_allowed(busiest->curr))) { + raw_spin_unlock_irqrestore(&busiest->lock, + flags); + env.flags |= LBF_ALL_PINNED; + goto out_one_pinned; + } + + /* + * ->active_balance synchronizes accesses to + * ->active_balance_work. Once set, it's cleared + * only after active load balance is finished. + */ + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + raw_spin_unlock_irqrestore(&busiest->lock, flags); + + if (active_balance) { + stop_one_cpu_nowait(cpu_of(busiest), + active_load_balance_cpu_stop, busiest, + &busiest->active_balance_work); + } + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries+1; + } + } else + sd->nr_balance_failed = 0; + + if (likely(!active_balance)) { + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + } + + goto out; + +out_balanced: + schedstat_inc(sd, lb_balanced[idle]); + + sd->nr_balance_failed = 0; + mt_lbprof_stat_set(sd->mt_lbprof_nr_balance_failed, 0); + +out_one_pinned: + /* tune up the balancing interval */ + if (((env.flags & LBF_ALL_PINNED) && + sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) + sd->balance_interval *= 2; + + ld_moved = 0; +out: + if (ld_moved){ + mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_SUCCESS); + mt_lbprof_stat_set(sd->mt_lbprof_nr_balance_failed, 0); + } + +#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER + if( CPU_NEWLY_IDLE == idle){ + char strings[128]=""; + snprintf(strings, 128, "%d:idle balance:%d:0x%x ", this_cpu, ld_moved, env.fail_reason); + mt_lbprof_rqinfo(strings); + trace_sched_lbprof_log(strings); + }else{ + char strings[128]=""; + snprintf(strings, 128, "%d:periodic balance:%d:0x%x ", this_cpu, ld_moved, env.fail_reason); + mt_lbprof_rqinfo(strings); + trace_sched_lbprof_log(strings); + } +#endif + + return ld_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +void idle_balance(int this_cpu, struct rq *this_rq) +{ + struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + HZ; +#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) && defined(CONFIG_LOCAL_TIMERS) + unsigned long counter = 0; +#endif + + this_rq->idle_stamp = this_rq->clock; + + mt_lbprof_update_state_has_lock(this_cpu, MT_LBPROF_UPDATE_STATE); +#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) && defined(CONFIG_LOCAL_TIMERS) + counter = localtimer_get_counter(); + if ( counter >= 260000 ) // 20ms + goto must_do; + if ( time_before(jiffies + 2, this_rq->next_balance) ) // 20ms + goto must_do; +#endif + + if (this_rq->avg_idle < sysctl_sched_migration_cost){ + + mt_lbprof_update_state_has_lock(this_cpu, MT_LBPROF_ALLOW_UNBLANCE_STATE); +#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) && defined(CONFIG_LOCAL_TIMERS) + mt_sched_printf(sched_lb, "%d:idle balance bypass: %llu %lu", + this_cpu, this_rq->avg_idle, counter); +#else + mt_sched_printf(sched_lb, "%d:idle balance bypass: %llu", + this_cpu, this_rq->avg_idle); +#endif + + return; + } + +#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) && defined(CONFIG_LOCAL_TIMERS) + must_do: +#endif + + /* + * Drop the rq->lock, but keep IRQ/preempt disabled. + */ + raw_spin_unlock(&this_rq->lock); + + mt_lbprof_update_status(); + update_blocked_averages(this_cpu); + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + unsigned long interval; + int balance = 1; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + if (sd->flags & SD_BALANCE_NEWIDLE) { + /* If we've pulled tasks over stop searching: */ + pulled_task = load_balance(this_cpu, this_rq, + sd, CPU_NEWLY_IDLE, &balance); + } + + interval = msecs_to_jiffies(sd->balance_interval); + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + if (pulled_task) { + this_rq->idle_stamp = 0; + break; + } + } + rcu_read_unlock(); + + raw_spin_lock(&this_rq->lock); + + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; + } +} + +/* + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + */ +static int active_load_balance_cpu_stop(void *data) +{ + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct sched_domain *sd; + + raw_spin_lock_irq(&busiest_rq->lock); + + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) + goto out_unlock; + + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + goto out_unlock; + + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); + for_each_domain(target_cpu, sd) { + if ((sd->flags & SD_LOAD_BALANCE) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + + schedstat_inc(sd, alb_count); + + if (move_one_task(&env)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + rcu_read_unlock(); + double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * idle load balancing details + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. + */ +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; + + +static inline int find_new_ilb(int call_cpu) +{ +#ifdef CONFIG_HMP_PACK_SMALL_TASK + +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + + struct sched_domain *sd; + + int ilb_new = nr_cpu_ids; + + int ilb_return = 0; + + int ilb = cpumask_first(nohz.idle_cpus_mask); + + + if(PA_ENABLE) + { + int buddy = per_cpu(sd_pack_buddy, call_cpu); + + /* + * If we have a pack buddy CPU, we try to run load balance on a CPU + * that is close to the buddy. + */ + if (buddy != -1) + for_each_domain(buddy, sd) { + if (sd->flags & SD_SHARE_CPUPOWER) + continue; + + ilb_new = cpumask_first_and(sched_domain_span(sd), + nohz.idle_cpus_mask); + + if (ilb_new < nr_cpu_ids) + break; + + } + } + + if (ilb < nr_cpu_ids && idle_cpu(ilb)) { + ilb_return = 1; + } + + if (ilb_new < nr_cpu_ids) { + if (idle_cpu(ilb_new)) { + if(PA_ENABLE && ilb_return && ilb_new != ilb) { + AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[call_cpu][ilb]++; + +#ifdef CONFIG_HMP_TRACER + trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY, 0, call_cpu, ilb); +#endif /* CONFIG_HMP_TRACER */ + + } + return ilb_new; + } + } + + if(ilb_return) { + return ilb; + } + + return nr_cpu_ids; + +#else /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + + struct sched_domain *sd; + int ilb = cpumask_first(nohz.idle_cpus_mask); + int buddy = per_cpu(sd_pack_buddy, call_cpu); + + /* + * If we have a pack buddy CPU, we try to run load balance on a CPU + * that is close to the buddy. + */ + if (buddy != -1) + for_each_domain(buddy, sd) { + if (sd->flags & SD_SHARE_CPUPOWER) + continue; + + ilb = cpumask_first_and(sched_domain_span(sd), + nohz.idle_cpus_mask); + + if (ilb < nr_cpu_ids) + break; + } + + if (ilb < nr_cpu_ids && idle_cpu(ilb)) + return ilb; + + return nr_cpu_ids; + +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + +#else /* CONFIG_HMP_PACK_SMALL_TASK */ + + int ilb = cpumask_first(nohz.idle_cpus_mask); +#ifdef CONFIG_MTK_SCHED_CMP_TGS + /* Find nohz balancing to occur in the same cluster firstly */ + int new_ilb; + struct cpumask tmp; + //Find idle cpu with online one + get_cluster_cpus(&tmp, get_cluster_id(call_cpu), true); + new_ilb = cpumask_first_and(nohz.idle_cpus_mask, &tmp); + if (new_ilb < nr_cpu_ids && idle_cpu(new_ilb)) + { +#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER + if(new_ilb != ilb) + { + mt_sched_printf(sched_pa, "[PA]find_new_ilb(cpu%x), new_ilb = %d, ilb = %d\n", call_cpu, new_ilb, ilb); + AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[call_cpu][ilb]++; + } +#endif + return new_ilb; + } +#endif /* CONFIG_MTK_SCHED_CMP_TGS */ + + if (ilb < nr_cpu_ids && idle_cpu(ilb)) + return ilb; + + return nr_cpu_ids; + +#endif /* CONFIG_HMP_PACK_SMALL_TASK */ + +} + + +/* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = find_new_ilb(cpu); + + if (ilb_cpu >= nr_cpu_ids) + return; + + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) + return; + /* + * Use smp_send_reschedule() instead of resched_cpu(). + * This way we generate a sched IPI on the target cpu which + * is idle. And the softirq performing nohz idle load balance + * will be run before returning from the IPI. + */ + smp_send_reschedule(ilb_cpu); + return; +} + +static inline void nohz_balance_exit_idle(int cpu) +{ + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + } +} + +static inline void set_cpu_sd_state_busy(void) +{ + struct sched_domain *sd; + int cpu = smp_processor_id(); + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + for (; sd; sd = sd->parent) + atomic_inc(&sd->groups->sgp->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + +void set_cpu_sd_state_idle(void) +{ + struct sched_domain *sd; + int cpu = smp_processor_id(); + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + for (; sd; sd = sd->parent) + atomic_dec(&sd->groups->sgp->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + +/* + * This routine will record that the cpu is going idle with tick stopped. + * This info will be used in performing idle load balancing in the future. + */ +void nohz_balance_enter_idle(int cpu) +{ + /* + * If this cpu is going down, then nothing needs to be done. + */ + if (!cpu_active(cpu)) + return; + + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) + return; + + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); +} + +static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DYING: + nohz_balance_exit_idle(smp_processor_id()); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +#endif + +static DEFINE_SPINLOCK(balancing); + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + +/* + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in init_sched_domains. + */ +static void rebalance_domains(int cpu, enum cpu_idle_type idle) +{ + int balance = 1; + struct rq *rq = cpu_rq(cpu); + unsigned long interval; + struct sched_domain *sd; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; + int need_serialize; + + update_blocked_averages(cpu); + + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != CPU_IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + need_serialize = sd->flags & SD_SERIALIZE; + + if (need_serialize) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { + /* + * The LBF_SOME_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. + */ + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; + } + sd->last_balance = jiffies; + } + if (need_serialize) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) { + next_balance = sd->last_balance + interval; + update_next_balance = 1; + } + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; + } + rcu_read_unlock(); + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || + !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) + goto end; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) + break; + + rq = cpu_rq(balance_cpu); + + raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + raw_spin_unlock_irq(&rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; +end: + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); +} + +/* + * Current heuristic for kicking the idle load balancer in the presence + * of an idle cpu is the system. + * - This rq has more than one task. + * - At any scheduler domain level, this cpu's scheduler group has multiple + * busy cpu's exceeding the group's power. + * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler + * domain span are idle. + */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + struct sched_domain *sd; + + if (unlikely(idle_cpu(cpu))) + return 0; + + /* + * We may be recently in ticked or tickless idle mode. At the first + * busy tick after returning from idle, we will update the busy stats. + */ + set_cpu_sd_state_busy(); + nohz_balance_exit_idle(cpu); + + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing. + */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return 0; + + if (time_before(now, nohz.next_balance)) + return 0; + +#ifdef CONFIG_SCHED_HMP + /* + * Bail out if there are no nohz CPUs in our + * HMP domain, since we will move tasks between + * domains through wakeup and force balancing + * as necessary based upon task load. + */ + if (cpumask_first_and(nohz.idle_cpus_mask, + &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids) + return 0; +#endif + + if (rq->nr_running >= 2) + goto need_kick; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + struct sched_group *sg = sd->groups; + struct sched_group_power *sgp = sg->sgp; + int nr_busy = atomic_read(&sgp->nr_busy_cpus); + + if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) + goto need_kick_unlock; + + if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight + && (cpumask_first_and(nohz.idle_cpus_mask, + sched_domain_span(sd)) < cpu)) + goto need_kick_unlock; + + if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) + break; + } + rcu_read_unlock(); + return 0; + +need_kick_unlock: + rcu_read_unlock(); +need_kick: + return 1; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +#ifdef CONFIG_SCHED_HMP +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + +/* + * Heterogenous Multi-Processor (HMP) - Declaration and Useful Macro + */ + +/* Function Declaration */ +static int hmp_up_stable(int cpu); +static int hmp_down_stable(int cpu); +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se, + struct clb_env *clbenv); +static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se, + struct clb_env *clbenv); + +#define hmp_caller_is_gb(caller) ((HMP_GB == caller)?1:0) + +#define hmp_cpu_is_fast(cpu) cpumask_test_cpu(cpu,&hmp_fast_cpu_mask) +#define hmp_cpu_is_slow(cpu) cpumask_test_cpu(cpu,&hmp_slow_cpu_mask) +#define hmp_cpu_stable(cpu) (hmp_cpu_is_fast(cpu)? \ + hmp_up_stable(cpu):hmp_down_stable(cpu)) + +#define hmp_inc(v) ((v) + 1) +#define hmp_dec(v) ((v) - 1) +#define hmp_pos(v) ((v) < (0) ? (0) : (v)) + +#define task_created(f) ((SD_BALANCE_EXEC == f || SD_BALANCE_FORK == f)?1:0) +#define task_cpus_allowed(mask,p) cpumask_intersects(mask,tsk_cpus_allowed(p)) +#define task_slow_cpu_allowed(p) task_cpus_allowed(&hmp_slow_cpu_mask,p) +#define task_fast_cpu_allowed(p) task_cpus_allowed(&hmp_fast_cpu_mask,p) + +/* + * Heterogenous Multi-Processor (HMP) - Utility Function + */ + +/* + * These functions add next up/down migration delay that prevents the task from + * doing another migration in the same direction until the delay has expired. + */ +static int hmp_up_stable(int cpu) +{ + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + u64 now = cfs_rq_clock_task(cfs_rq); + if (((now - hmp_last_up_migration(cpu)) >> 10) < hmp_next_up_threshold) + return 0; + return 1; +} + +static int hmp_down_stable(int cpu) +{ + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + u64 now = cfs_rq_clock_task(cfs_rq); + if (((now - hmp_last_down_migration(cpu)) >> 10) < hmp_next_down_threshold) + return 0; + return 1; +} + +/* Select the most appropriate CPU from hmp cluster */ +static unsigned int hmp_select_cpu(unsigned int caller, struct task_struct *p, + struct cpumask *mask, int prev) +{ + int curr = 0; + int target = NR_CPUS; + unsigned long curr_wload = 0; + unsigned long target_wload = 0; + struct cpumask srcp; + cpumask_and(&srcp, cpu_online_mask, mask); + target = cpumask_any_and(&srcp, tsk_cpus_allowed(p)); + if (NR_CPUS == target) + goto out; + + /* + * RT class is taken into account because CPU load is multiplied + * by the total number of CPU runnable tasks that includes RT tasks. + */ + target_wload = hmp_inc(cfs_load(target)); + target_wload += cfs_pending_load(target); + target_wload *= rq_length(target); + for_each_cpu(curr, mask) { + /* Check CPU status and task affinity */ + if(!cpu_online(curr) || !cpumask_test_cpu(curr, tsk_cpus_allowed(p))) + continue; + + /* For global load balancing, unstable CPU will be bypassed */ + if(hmp_caller_is_gb(caller) && !hmp_cpu_stable(curr)) + continue; + + curr_wload = hmp_inc(cfs_load(curr)); + curr_wload += cfs_pending_load(curr); + curr_wload *= rq_length(curr); + if(curr_wload < target_wload) { + target_wload = curr_wload; + target = curr; + } else if(curr_wload == target_wload && curr == prev) { + target = curr; + } + } + +out: + return target; +} + +/* + * Heterogenous Multi-Processor (HMP) - Task Runqueue Selection + */ + +/* This function enhances the original task selection function */ +static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p, + int prev_cpu, int new_cpu) +{ +#ifdef CONFIG_HMP_TASK_ASSIGNMENT + int step = 0; + struct sched_entity *se = &p->se; + int B_target = NR_CPUS; + int L_target = NR_CPUS; + struct clb_env clbenv; + +#ifdef CONFIG_HMP_TRACER + int cpu = 0; + for_each_online_cpu(cpu) + trace_sched_cfs_runnable_load(cpu,cfs_load(cpu),cfs_length(cpu)); +#endif + + // error handling + if (prev_cpu >= NR_CPUS) + return new_cpu; + + /* + * Skip all the checks if only one CPU is online. + * Otherwise, select the most appropriate CPU from cluster. + */ + if (num_online_cpus() == 1) + goto out; + B_target = hmp_select_cpu(HMP_SELECT_RQ,p,&hmp_fast_cpu_mask,prev_cpu); + L_target = hmp_select_cpu(HMP_SELECT_RQ,p,&hmp_slow_cpu_mask,prev_cpu); + + /* + * Only one cluster exists or only one cluster is allowed for this task + * Case 1: return the runqueue whose load is minimum + * Case 2: return original CFS runqueue selection result + */ +#ifdef CONFIG_HMP_DISCARD_CFS_SELECTION_RESULT + if(NR_CPUS == B_target && NR_CPUS == L_target) + goto out; + if(NR_CPUS == B_target) + goto select_slow; + if(NR_CPUS == L_target) + goto select_fast; +#else + if(NR_CPUS == B_target || NR_CPUS == L_target) + goto out; +#endif + + /* + * Two clusters exist and both clusters are allowed for this task + * Step 1: Move newly created task to the cpu where no tasks are running + * Step 2: Migrate heavy-load task to big + * Step 3: Migrate light-load task to LITTLE + * Step 4: Make sure the task stays in its previous hmp domain + */ + step = 1; + if (task_created(sd_flag) && !task_low_priority(p->prio)) { + if (!rq_length(B_target)) + goto select_fast; + if (!rq_length(L_target)) + goto select_slow; + } + memset(&clbenv, 0, sizeof(clbenv)); + clbenv.flags |= HMP_SELECT_RQ; + clbenv.lcpus = &hmp_slow_cpu_mask; + clbenv.bcpus = &hmp_fast_cpu_mask; + clbenv.ltarget = L_target; + clbenv.btarget = B_target; + sched_update_clbstats(&clbenv); + step = 2; + if (hmp_up_migration(L_target, &B_target, se, &clbenv)) + goto select_fast; + step = 3; + if (hmp_down_migration(B_target, &L_target, se, &clbenv)) + goto select_slow; + step = 4; + if (hmp_cpu_is_slow(prev_cpu)) + goto select_slow; + goto select_fast; + +select_fast: + new_cpu = B_target; + goto out; +select_slow: + new_cpu = L_target; + goto out; + +out: + + // it happens when num_online_cpus=1 + if (new_cpu >= nr_cpu_ids) + { + //BUG_ON(1); + new_cpu = prev_cpu; + } + + cfs_nr_pending(new_cpu)++; + cfs_pending_load(new_cpu) += se_load(se); +#ifdef CONFIG_HMP_TRACER + trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg); + trace_sched_hmp_select_task_rq(p,step,sd_flag,prev_cpu,new_cpu, + se_load(se),&clbenv.bstats,&clbenv.lstats); +#endif +#ifdef CONFIG_MET_SCHED_HMP + HmpLoad(clbenv.bstats.load_avg, clbenv.lstats.load_avg); +#endif +#endif /* CONFIG_HMP_TASK_ASSIGNMENT */ + return new_cpu; +} + +/* + * Heterogenous Multi-Processor (HMP) - Task Dynamic Migration Threshold + */ + +/* + * If the workload between clusters is not balanced, adjust migration + * threshold in an attempt to move task to the cluster where the workload + * is not heavy + */ + +/* + * According to ARM's cpu_efficiency table, the computing power of CA15 and + * CA7 are 3891 and 2048 respectively. Thus, we assume big has twice the + * computing power of LITTLE + */ + +#define HMP_RATIO(v) ((v)*17/10) + +#define hmp_fast_cpu_has_spare_cycles(B,cpu_load) (cpu_load < \ + (HMP_RATIO(B->cpu_capacity) - (B->cpu_capacity >> 2))) + +#define hmp_task_fast_cpu_afford(B,se,cpu) (B->acap > 0 \ + && hmp_fast_cpu_has_spare_cycles(B,se_load(se) + cfs_load(cpu))) + +#define hmp_fast_cpu_oversubscribed(caller,B,se,cpu) \ + (hmp_caller_is_gb(caller)? \ + !hmp_fast_cpu_has_spare_cycles(B,cfs_load(cpu)): \ + !hmp_task_fast_cpu_afford(B,se,cpu)) + +#define hmp_task_slow_cpu_afford(L,se) \ + (L->acap > 0 && L->acap >= se_load(se)) + +/* Macro used by low-priorty task filter */ +#define hmp_low_prio_task_up_rejected(p,B,L) \ + (task_low_priority(p->prio) && \ + (B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \ + (p->se.avg.load_avg_ratio < 800)) + +#define hmp_low_prio_task_down_allowed(p,B,L) \ + (task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \ + B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \ + (p->se.avg.load_avg_ratio < 800)) + +/* Migration check result */ +#define HMP_BIG_NOT_OVERSUBSCRIBED (0x01) +#define HMP_BIG_CAPACITY_INSUFFICIENT (0x02) +#define HMP_LITTLE_CAPACITY_INSUFFICIENT (0x04) +#define HMP_LOW_PRIORITY_FILTER (0x08) +#define HMP_BIG_BUSY_LITTLE_IDLE (0x10) +#define HMP_BIG_IDLE (0x20) +#define HMP_MIGRATION_APPROVED (0x100) +#define HMP_TASK_UP_MIGRATION (0x200) +#define HMP_TASK_DOWN_MIGRATION (0x400) + +/* Migration statistics */ +#ifdef CONFIG_HMP_TRACER +struct hmp_statisic hmp_stats; +#endif + +static inline void hmp_dynamic_threshold(struct clb_env *clbenv) +{ + struct clb_stats *L = &clbenv->lstats; + struct clb_stats *B = &clbenv->bstats; + unsigned int hmp_threshold_diff = hmp_up_threshold - hmp_down_threshold; + unsigned int B_normalized_acap = hmp_pos(HMP_RATIO(B->scaled_acap)); + unsigned int B_normalized_atask = hmp_pos(HMP_RATIO(B->scaled_atask)); + unsigned int L_normalized_acap = hmp_pos(L->scaled_acap); + unsigned int L_normalized_atask = hmp_pos(L->scaled_atask); + +#ifdef CONFIG_HMP_DYNAMIC_THRESHOLD + L->threshold = hmp_threshold_diff; + L->threshold *= hmp_inc(L_normalized_acap) * hmp_inc(L_normalized_atask); + L->threshold /= hmp_inc(B_normalized_acap + L_normalized_acap); + L->threshold /= hmp_inc(B_normalized_atask + L_normalized_atask); + L->threshold = hmp_down_threshold + L->threshold; + + B->threshold = hmp_threshold_diff; + B->threshold *= hmp_inc(B_normalized_acap) * hmp_inc(B_normalized_atask); + B->threshold /= hmp_inc(B_normalized_acap + L_normalized_acap); + B->threshold /= hmp_inc(B_normalized_atask + L_normalized_atask); + B->threshold = hmp_up_threshold - B->threshold; +#else /* !CONFIG_HMP_DYNAMIC_THRESHOLD */ + clbenv->lstats.threshold = hmp_down_threshold; // down threshold + clbenv->bstats.threshold = hmp_up_threshold; // up threshold +#endif /* CONFIG_HMP_DYNAMIC_THRESHOLD */ + + mt_sched_printf("[%s]\tup/dl:%4d/%4d bcpu(%d):%d/%d, lcpu(%d):%d/%d\n", __func__, + B->threshold, L->threshold, + clbenv->btarget, clbenv->bstats.cpu_capacity, clbenv->bstats.cpu_power, + clbenv->ltarget, clbenv->lstats.cpu_capacity, clbenv->lstats.cpu_power); +} + +/* + * Check whether this task should be migrated to big + * Briefly summarize the flow as below; + * 1) Migration stabilizing + * 1.5) Keep all cpu busy + * 2) Filter low-priorty task + * 3) Check CPU capacity + * 4) Check dynamic migration threshold + */ +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se, + struct clb_env *clbenv) +{ + struct task_struct *p = task_of(se); + struct clb_stats *L, *B; + struct mcheck *check; + int curr_cpu = cpu; + unsigned int caller = clbenv->flags; + + L = &clbenv->lstats; + B = &clbenv->bstats; + check = &clbenv->mcheck; + + check->status = clbenv->flags; + check->status |= HMP_TASK_UP_MIGRATION; + check->result = 0; + + /* + * No migration is needed if + * 1) There is only one cluster + * 2) Task is already in big cluster + * 3) It violates task affinity + */ + if (!L->ncpu || !B->ncpu + || cpumask_test_cpu(curr_cpu, clbenv->bcpus) + || !cpumask_intersects(clbenv->bcpus, tsk_cpus_allowed(p))) + goto out; + + /* + * [1] Migration stabilizing + * Let the task load settle before doing another up migration. + * It can prevent a bunch of tasks from migrating to a unstable CPU. + */ + if (!hmp_up_stable(*target_cpu)) + goto out; + + /* [2] Filter low-priorty task */ +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + if (hmp_low_prio_task_up_rejected(p,B,L)) { + check->status |= HMP_LOW_PRIORITY_FILTER; + goto trace; + } +#endif + + // [2.5]if big is idle, just go to big + if (rq_length(*target_cpu)==0) + { + check->status |= HMP_BIG_IDLE; + check->status |= HMP_MIGRATION_APPROVED; + check->result = 1; + goto trace; + } + + /* + * [3] Check CPU capacity + * Forbid up-migration if big CPU can't handle this task + */ + if (!hmp_task_fast_cpu_afford(B,se,*target_cpu)) { + check->status |= HMP_BIG_CAPACITY_INSUFFICIENT; + goto trace; + } + + /* + * [4] Check dynamic migration threshold + * Migrate task from LITTLE to big if load is greater than up-threshold + */ + if (se_load(se) > B->threshold) { + check->status |= HMP_MIGRATION_APPROVED; + check->result = 1; + } + +trace: +#ifdef CONFIG_HMP_TRACER + if(check->result && hmp_caller_is_gb(caller)) + hmp_stats.nr_force_up++; + trace_sched_hmp_stats(&hmp_stats); + trace_sched_dynamic_threshold(task_of(se),B->threshold,check->status, + curr_cpu,*target_cpu,se_load(se),B,L); +#endif +#ifdef CONFIG_MET_SCHED_HMP + TaskTh(B->threshold,L->threshold); + HmpStat(&hmp_stats); +#endif +out: + return check->result; +} + +/* + * Check whether this task should be migrated to LITTLE + * Briefly summarize the flow as below; + * 1) Migration stabilizing + * 1.5) Keep all cpu busy + * 2) Filter low-priorty task + * 3) Check CPU capacity + * 4) Check dynamic migration threshold + */ +static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se, + struct clb_env *clbenv) +{ + struct task_struct *p = task_of(se); + struct clb_stats *L, *B; + struct mcheck *check; + int curr_cpu = cpu; + unsigned int caller = clbenv->flags; + + L = &clbenv->lstats; + B = &clbenv->bstats; + check = &clbenv->mcheck; + + check->status = caller; + check->status |= HMP_TASK_DOWN_MIGRATION; + check->result = 0; + + /* + * No migration is needed if + * 1) There is only one cluster + * 2) Task is already in LITTLE cluster + * 3) It violates task affinity + */ + if (!L->ncpu || !B->ncpu + || cpumask_test_cpu(curr_cpu, clbenv->lcpus) + || !cpumask_intersects(clbenv->lcpus, tsk_cpus_allowed(p))) + goto out; + + /* + * [1] Migration stabilizing + * Let the task load settle before doing another down migration. + * It can prevent a bunch of tasks from migrating to a unstable CPU. + */ + if (!hmp_down_stable(*target_cpu)) + goto out; + + // [1.5]if big is busy and little is idle, just go to little + if (rq_length(*target_cpu)==0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu)>0) + { + check->status |= HMP_BIG_BUSY_LITTLE_IDLE; + check->status |= HMP_MIGRATION_APPROVED; + check->result = 1; + goto trace; + } + + /* [2] Filter low-priorty task */ +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + if (hmp_low_prio_task_down_allowed(p,B,L)) { + cfs_nr_dequeuing_low_prio(curr_cpu)++; + check->status |= HMP_LOW_PRIORITY_FILTER; + check->status |= HMP_MIGRATION_APPROVED; + check->result = 1; + goto trace; + } +#endif + + /* + * [3] Check CPU capacity + * Forbid down-migration if either of the following conditions is true + * 1) big cpu is not oversubscribed (if big CPU seems to have spare + * cycles, do not force this task to run on LITTLE CPU, but + * keep it staying in its previous cluster instead) + * 2) LITTLE cpu doesn't have available capacity for this new task + */ + if (!hmp_fast_cpu_oversubscribed(caller,B,se,curr_cpu)) { + check->status |= HMP_BIG_NOT_OVERSUBSCRIBED; + goto trace; + } + + if (!hmp_task_slow_cpu_afford(L,se)) { + check->status |= HMP_LITTLE_CAPACITY_INSUFFICIENT; + goto trace; + } + + /* + * [4] Check dynamic migration threshold + * Migrate task from big to LITTLE if load ratio is less than + * or equal to down-threshold + */ + if (L->threshold >= se_load(se)) { + check->status |= HMP_MIGRATION_APPROVED; + check->result = 1; + } + +trace: +#ifdef CONFIG_HMP_TRACER + if (check->result && hmp_caller_is_gb(caller)) + hmp_stats.nr_force_down++; + trace_sched_hmp_stats(&hmp_stats); + trace_sched_dynamic_threshold(task_of(se),L->threshold,check->status, + curr_cpu,*target_cpu,se_load(se),B,L); +#endif +#ifdef CONFIG_MET_SCHED_HMP + TaskTh(B->threshold,L->threshold); + HmpStat(&hmp_stats); +#endif +out: + return check->result; +} +#else /* CONFIG_SCHED_HMP_ENHANCEMENT */ +/* Check if task should migrate to a faster cpu */ +static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + u64 now; + + if (target_cpu) + *target_cpu = NR_CPUS; + + if (hmp_cpu_is_fastest(cpu)) + return 0; + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + /* Filter by task priority */ + if (p->prio >= hmp_up_prio) + return 0; +#endif + if (se->avg.load_avg_ratio < hmp_up_threshold) + return 0; + + /* Let the task load settle before doing another up migration */ + now = cfs_rq_clock_task(cfs_rq); + if (((now - se->avg.hmp_last_up_migration) >> 10) + < hmp_next_up_threshold) + return 0; + + /* Target domain load < 94% */ + if (hmp_domain_min_load(hmp_faster_domain(cpu), target_cpu) + > NICE_0_LOAD-64) + return 0; + + if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus, + tsk_cpus_allowed(p))) + return 1; + + return 0; +} + +/* Check if task should migrate to a slower cpu */ +static unsigned int hmp_down_migration(int cpu, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + u64 now; + + if (hmp_cpu_is_slowest(cpu)) + return 0; + +#ifdef CONFIG_SCHED_HMP_PRIO_FILTER + /* Filter by task priority */ + if ((p->prio >= hmp_up_prio) && + cpumask_intersects(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(p))) { + return 1; + } +#endif + + /* Let the task load settle before doing another down migration */ + now = cfs_rq_clock_task(cfs_rq); + if (((now - se->avg.hmp_last_down_migration) >> 10) + < hmp_next_down_threshold) + return 0; + + if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus, + tsk_cpus_allowed(p)) + && se->avg.load_avg_ratio < hmp_down_threshold) { + return 1; + } + return 0; +} +#endif /* CONFIG_SCHED_HMP_ENHANCEMENT */ + +/* + * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + * Ideally this function should be merged with can_migrate_task() to avoid + * redundant code. + */ +static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env) +{ + int tsk_cache_hot = 0; + + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed + */ + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + return 0; + } + env->flags &= ~LBF_ALL_PINNED; + + if (task_running(env->src_rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ + +#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd, env->mt_check_cache_in_idle); +#else + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); +#endif + if (!tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(env->sd, lb_hot_gained[env->idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + + return 1; +} + +/* + * move_specific_task tries to move a specific task. + * Returns 1 if successful and 0 otherwise. + * Called with both runqueues locked. + */ +static int move_specific_task(struct lb_env *env, struct task_struct *pm) +{ + struct task_struct *p, *n; + + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, + env->dst_cpu)) + continue; + + if (!hmp_can_migrate_task(p, env)) + continue; + /* Check if we found the right task */ + if (p != pm) + continue; + + move_task(p, env); + /* + * Right now, this is only the third place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; + } + return 0; +} + +/* + * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to + * migrate a specific task from one runqueue to another. + * hmp_force_up_migration uses this to push a currently running task + * off a runqueue. + * Based on active_load_balance_stop_cpu and can potentially be merged. + */ +static int hmp_active_task_migration_cpu_stop(void *data) +{ + struct rq *busiest_rq = data; + struct task_struct *p = busiest_rq->migrate_task; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct sched_domain *sd; + + raw_spin_lock_irq(&busiest_rq->lock); + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) { + goto out_unlock; + } + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + goto out_unlock; + /* Task has migrated meanwhile, abort forced migration */ + if (task_rq(p) != busiest_rq) + goto out_unlock; + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); + for_each_domain(target_cpu, sd) { + if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; + + schedstat_inc(sd, alb_count); + + if (move_specific_task(&env, p)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + rcu_read_unlock(); + double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; +} + +static DEFINE_SPINLOCK(hmp_force_migration); +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT +/* + * Heterogenous Multi-Processor (HMP) Global Load Balance + */ + +/* + * According to Linaro's comment, we should only check the currently running + * tasks because selecting other tasks for migration will require extensive + * book keeping. + */ +#ifdef CONFIG_HMP_GLOBAL_BALANCE +static void hmp_force_down_migration(int this_cpu) +{ + int curr_cpu, target_cpu; + struct sched_entity *se; + struct rq *target; + unsigned long flags; + unsigned int force; + struct task_struct *p; + struct clb_env clbenv; + + /* Migrate light task from big to LITTLE */ + for_each_cpu(curr_cpu, &hmp_fast_cpu_mask) { + /* Check whether CPU is online */ + if(!cpu_online(curr_cpu)) + continue; + + force = 0; + target = cpu_rq(curr_cpu); + raw_spin_lock_irqsave(&target->lock, flags); + se = target->cfs.curr; + if (!se) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + + /* Find task entity */ + if (!entity_is_task(se)) { + struct cfs_rq *cfs_rq; + cfs_rq = group_cfs_rq(se); + while (cfs_rq) { + se = cfs_rq->curr; + cfs_rq = group_cfs_rq(se); + } + } + + p = task_of(se); + target_cpu = hmp_select_cpu(HMP_GB,p,&hmp_slow_cpu_mask,-1); + if(NR_CPUS == target_cpu) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + + /* Collect cluster information */ + memset(&clbenv, 0, sizeof(clbenv)); + clbenv.flags |= HMP_GB; + clbenv.btarget = curr_cpu; + clbenv.ltarget = target_cpu; + clbenv.lcpus = &hmp_slow_cpu_mask; + clbenv.bcpus = &hmp_fast_cpu_mask; + sched_update_clbstats(&clbenv); + + /* Check migration threshold */ + if (!target->active_balance && + hmp_down_migration(curr_cpu, &target_cpu, se, &clbenv)) { + target->active_balance = 1; + target->push_cpu = target_cpu; + target->migrate_task = p; + force = 1; + trace_sched_hmp_migrate(p, target->push_cpu, 1); + hmp_next_down_delay(&p->se, target->push_cpu); + } + raw_spin_unlock_irqrestore(&target->lock, flags); + if (force) { + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } + } +} +#endif /* CONFIG_HMP_GLOBAL_BALANCE */ +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER +u32 AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS]; +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + +static void hmp_force_up_migration(int this_cpu) +{ + int curr_cpu, target_cpu; + struct sched_entity *se; + struct rq *target; + unsigned long flags; + unsigned int force; + struct task_struct *p; + struct clb_env clbenv; +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + int push_cpu; +#endif + + if (!spin_trylock(&hmp_force_migration)) + return; + +#ifdef CONFIG_HMP_TRACER + for_each_online_cpu(curr_cpu) + trace_sched_cfs_runnable_load(curr_cpu,cfs_load(curr_cpu), + cfs_length(curr_cpu)); +#endif + + /* Migrate heavy task from LITTLE to big */ + for_each_cpu(curr_cpu, &hmp_slow_cpu_mask) { + /* Check whether CPU is online */ + if(!cpu_online(curr_cpu)) + continue; + + force = 0; + target = cpu_rq(curr_cpu); + raw_spin_lock_irqsave(&target->lock, flags); + se = target->cfs.curr; + if (!se) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + + /* Find task entity */ + if (!entity_is_task(se)) { + struct cfs_rq *cfs_rq; + cfs_rq = group_cfs_rq(se); + while (cfs_rq) { + se = cfs_rq->curr; + cfs_rq = group_cfs_rq(se); + } + } + + p = task_of(se); + target_cpu = hmp_select_cpu(HMP_GB,p,&hmp_fast_cpu_mask,-1); + if(NR_CPUS == target_cpu) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + + /* Collect cluster information */ + memset(&clbenv, 0, sizeof(clbenv)); + clbenv.flags |= HMP_GB; + clbenv.ltarget = curr_cpu; + clbenv.btarget = target_cpu; + clbenv.lcpus = &hmp_slow_cpu_mask; + clbenv.bcpus = &hmp_fast_cpu_mask; + sched_update_clbstats(&clbenv); + +#ifdef CONFIG_HMP_LAZY_BALANCE +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + if (PA_ENABLE && LB_ENABLE) { +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + if (is_light_task(p) && !is_buddy_busy(per_cpu(sd_pack_buddy, curr_cpu))) { +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + push_cpu = hmp_select_cpu(HMP_GB,p,&hmp_fast_cpu_mask,-1); + if (hmp_cpu_is_fast(push_cpu)) { + AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT[curr_cpu][push_cpu]++; +#ifdef CONFIG_HMP_TRACER + trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY, p->pid, curr_cpu, push_cpu); +#endif /* CONFIG_HMP_TRACER */ + } +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + goto out_force_up; + } +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + } +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ +#endif /* CONFIG_HMP_LAZY_BALANCE */ + + /* Check migration threshold */ + if (!target->active_balance && + hmp_up_migration(curr_cpu, &target_cpu, se, &clbenv)) { + target->active_balance = 1; + target->push_cpu = target_cpu; + target->migrate_task = p; + force = 1; + trace_sched_hmp_migrate(p, target->push_cpu, 1); + hmp_next_up_delay(&p->se, target->push_cpu); + } + +#ifdef CONFIG_HMP_LAZY_BALANCE +out_force_up: +#endif /* CONFIG_HMP_LAZY_BALANCE */ + + raw_spin_unlock_irqrestore(&target->lock, flags); + if (force) { + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } + } + +#ifdef CONFIG_HMP_GLOBAL_BALANCE + hmp_force_down_migration(this_cpu); +#endif +#ifdef CONFIG_HMP_TRACER + trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg); +#endif + spin_unlock(&hmp_force_migration); +} +#else /* CONFIG_SCHED_HMP_ENHANCEMENT */ +/* + * hmp_force_up_migration checks runqueues for tasks that need to + * be actively migrated to a faster cpu. + */ +static void hmp_force_up_migration(int this_cpu) +{ + int cpu, target_cpu; + struct sched_entity *curr; + struct rq *target; + unsigned long flags; + unsigned int force; + struct task_struct *p; + + if (!spin_trylock(&hmp_force_migration)) + return; + for_each_online_cpu(cpu) { + force = 0; + target = cpu_rq(cpu); + raw_spin_lock_irqsave(&target->lock, flags); + curr = target->cfs.curr; + if (!curr) { + raw_spin_unlock_irqrestore(&target->lock, flags); + continue; + } + if (!entity_is_task(curr)) { + struct cfs_rq *cfs_rq; + + cfs_rq = group_cfs_rq(curr); + while (cfs_rq) { + curr = cfs_rq->curr; + cfs_rq = group_cfs_rq(curr); + } + } + p = task_of(curr); + if (hmp_up_migration(cpu, &target_cpu, curr)) { + if (!target->active_balance) { + target->active_balance = 1; + target->push_cpu = target_cpu; + target->migrate_task = p; + force = 1; + trace_sched_hmp_migrate(p, target->push_cpu, 1); + hmp_next_up_delay(&p->se, target->push_cpu); + } + } + if (!force && !target->active_balance) { + /* + * For now we just check the currently running task. + * Selecting the lightest task for offloading will + * require extensive book keeping. + */ + target->push_cpu = hmp_offload_down(cpu, curr); + if (target->push_cpu < NR_CPUS) { + target->active_balance = 1; + target->migrate_task = p; + force = 1; + trace_sched_hmp_migrate(p, target->push_cpu, 2); + hmp_next_down_delay(&p->se, target->push_cpu); + } + } + raw_spin_unlock_irqrestore(&target->lock, flags); + if (force) + stop_one_cpu_nowait(cpu_of(target), + hmp_active_task_migration_cpu_stop, + target, &target->active_balance_work); + } + spin_unlock(&hmp_force_migration); +} +#endif /* CONFIG_SCHED_HMP_ENHANCEMENT */ +#else +static void hmp_force_up_migration(int this_cpu) { } +#endif /* CONFIG_SCHED_HMP */ + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); + enum cpu_idle_type idle = this_rq->idle_balance ? + CPU_IDLE : CPU_NOT_IDLE; + + hmp_force_up_migration(this_cpu); + + rebalance_domains(this_cpu, idle); + + /* + * If this cpu has a pending nohz_balance_kick, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + nohz_idle_balance(this_cpu, idle); +} + +static inline int on_null_domain(int cpu) +{ + return !rcu_dereference_sched(cpu_rq(cpu)->sd); +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + */ +void trigger_load_balance(struct rq *rq, int cpu) +{ + /* Don't need to rebalance while attached to NULL domain */ + if (time_after_eq(jiffies, rq->next_balance) && + likely(!on_null_domain(cpu))) + raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ_COMMON + if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif +} + +static void rq_online_fair(struct rq *rq) +{ +#ifdef CONFIG_SCHED_HMP + hmp_online_cpu(rq->cpu); +#endif + update_sysctl(); +} + +static void rq_offline_fair(struct rq *rq) +{ +#ifdef CONFIG_SCHED_HMP + hmp_offline_cpu(rq->cpu); +#endif + update_sysctl(); + + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); +} + +#endif /* CONFIG_SMP */ + +/* + * scheduler tick hitting a task of our scheduling class: + */ +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &curr->se; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + entity_tick(cfs_rq, se, queued); + } + + if (sched_feat_numa(NUMA)) + task_tick_numa(rq, curr); + + update_rq_runnable_avg(rq, 1); +} + +/* + * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled + */ +static void task_fork_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se, *curr; + int this_cpu = smp_processor_id(); + struct rq *rq = this_rq(); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + + update_rq_clock(rq); + + cfs_rq = task_cfs_rq(current); + curr = cfs_rq->curr; + + /* + * Not only the cpu but also the task_group of the parent might have + * been changed after parent->se.parent,cfs_rq were copied to + * child->se.parent,cfs_rq. So call __set_task_cpu() to make those + * of child point to valid ones. + */ + rcu_read_lock(); + __set_task_cpu(p, this_cpu); + rcu_read_unlock(); + + update_curr(cfs_rq); + + if (curr) + se->vruntime = curr->vruntime; + place_entity(cfs_rq, se, 1); + + if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { + /* + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value. + */ + swap(curr->vruntime, se->vruntime); + resched_task(rq->curr); + } + + se->vruntime -= cfs_rq->min_vruntime; + + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +/* + * Priority of the task has changed. Check to see if we preempt + * the current task. + */ +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +{ + if (!p->se.on_rq) + return; + + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (rq->curr == p) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p, 0); +} + +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Ensure the task's vruntime is normalized, so that when it's + * switched back to the fair class the enqueue_entity(.flags=0) will + * do the right thing. + * + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when + * the task is sleeping will it still have non-normalized vruntime. + */ + if (!p->on_rq && p->state != TASK_RUNNING) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } + +#ifdef CONFIG_SMP + /* + * Remove our load from contribution when we leave sched_fair + * and ensure we don't carry in an old decay_count if we + * switch back. + */ + if (p->se.avg.decay_count) { + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + __synchronize_entity_decay(&p->se); + subtract_blocked_load_contrib(cfs_rq, + p->se.avg.load_avg_contrib); + } +#endif +} + +/* + * We switched to the sched_fair class. + */ +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ + if (!p->se.on_rq) + return; + + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (rq->curr == p) + resched_task(rq->curr); + else{ + /* + When task p change priority form RT to normal priority + in switch_from_rt(), it might call pull_rt_task + and potentially double_lock_balance will unlock rq. + Task p might migrate to other CPU and result in task p is NOT at rq. + In this case, it is not necessary to check preempt for rq. + (Because task p is NOT at rq anymore) + and the migrate flow for task p will check preempt in enqueue flow. + So bypass the check_preempt_curr. + */ + if (rq == task_rq(p)) { + check_preempt_curr(rq, p, 0); + } + } +} + +/* Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_curr_task_fair(struct rq *rq) +{ + struct sched_entity *se = &rq->curr->se; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + set_next_entity(cfs_rq, se); + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } +} + +void init_cfs_rq(struct cfs_rq *cfs_rq) +{ + cfs_rq->tasks_timeline = RB_ROOT; + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif +#ifdef CONFIG_SMP + atomic64_set(&cfs_rq->decay_counter, 1); + atomic_long_set(&cfs_rq->removed_load, 0); +#endif +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void task_move_group_fair(struct task_struct *p, int on_rq) +{ + struct cfs_rq *cfs_rq; + /* + * If the task was not on the rq at the time of this cgroup movement + * it must have been asleep, sleeping tasks keep their ->vruntime + * absolute on their old rq until wakeup (needed for the fair sleeper + * bonus in place_entity()). + * + * If it was on the rq, we've just 'preempted' it, which does convert + * ->vruntime to a relative base. + * + * Make sure both cases convert their relative position when migrating + * to another cgroup's rq. This does somewhat interfere with the + * fair sleeper stuff for the first placement, but who cares. + */ + /* + * When !on_rq, vruntime of the task has usually NOT been normalized. + * But there are some cases where it has already been normalized: + * + * - Moving a forked child which is waiting for being woken up by + * wake_up_new_task(). + * - Moving a task which has been woken up by try_to_wake_up() and + * waiting for actually being woken up by sched_ttwu_pending(). + * + * To prevent boost or penalty in the new cfs_rq caused by delta + * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. + */ + if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + on_rq = 1; + + if (!on_rq) + p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + set_task_rq(p, task_cpu(p)); + if (!on_rq) { + cfs_rq = cfs_rq_of(&p->se); + p->se.vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP + /* + * migrate_task_rq_fair() will have removed our previous + * contribution, but we must synchronize for ongoing future + * decay. + */ + p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +#endif + } +} + +void free_fair_sched_group(struct task_group *tg) +{ + int i; + + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); +} + +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->se) + goto err; + + tg->shares = NICE_0_LOAD; + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(i) { + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!se) + goto err_free_rq; + + init_cfs_rq(cfs_rq); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + } + + return 1; + +err_free_rq: + kfree(cfs_rq); +err: + return 0; +} + +void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + cfs_rq->tg = tg; + cfs_rq->rq = rq; + init_cfs_rq_runtime(cfs_rq); + + tg->cfs_rq[cpu] = cfs_rq; + tg->se[cpu] = se; + + /* se could be NULL for root_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + + se->my_q = cfs_rq; + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); + se->parent = parent; +} + +static DEFINE_MUTEX(shares_mutex); + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int i; + unsigned long flags; + + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + + shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); + + mutex_lock(&shares_mutex); + if (tg->shares == shares) + goto done; + + tg->shares = shares; + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se)); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + +done: + mutex_unlock(&shares_mutex); + return 0; +} +#else /* CONFIG_FAIR_GROUP_SCHED */ + +void free_fair_sched_group(struct task_group *tg) { } + +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} + +void unregister_fair_sched_group(struct task_group *tg, int cpu) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + +static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) +{ + struct sched_entity *se = &task->se; + unsigned int rr_interval = 0; + + /* + * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise + * idle runqueue: + */ + if (rq->cfs.load.weight) + rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); + + return rr_interval; +} + +/* + * All the scheduling class methods: + */ +const struct sched_class fair_sched_class = { + .next = &idle_sched_class, + .enqueue_task = enqueue_task_fair, + .dequeue_task = dequeue_task_fair, + .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, + + .check_preempt_curr = check_preempt_wakeup, + + .pick_next_task = pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, + + .rq_online = rq_online_fair, + .rq_offline = rq_offline_fair, + + .task_waking = task_waking_fair, +#endif + + .set_curr_task = set_curr_task_fair, + .task_tick = task_tick_fair, + .task_fork = task_fork_fair, + + .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, + .switched_to = switched_to_fair, + + .get_rr_interval = get_rr_interval_fair, + +#ifdef CONFIG_FAIR_GROUP_SCHED + .task_move_group = task_move_group_fair, +#endif +}; + +#ifdef CONFIG_SCHED_DEBUG +void print_cfs_stats(struct seq_file *m, int cpu) +{ + struct cfs_rq *cfs_rq; + + rcu_read_lock(); + for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + print_cfs_rq(m, cpu, cfs_rq); + rcu_read_unlock(); +} +#endif + +__init void init_sched_fair_class(void) +{ +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + +#ifdef CONFIG_NO_HZ_COMMON + nohz.next_balance = jiffies; + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + cpu_notifier(sched_ilb_notifier, 0); +#endif + + cmp_cputopo_domain_setup(); +#ifdef CONFIG_SCHED_HMP + hmp_cpu_mask_setup(); +#endif +#endif /* SMP */ +} + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE +static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr) +{ + u32 result = curr / max; + return result; +} + +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER +DEFINE_PER_CPU(u32, FREQ_CPU); +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + +/* Called when the CPU Frequency is changed. + * Once for each CPU. + */ +static int cpufreq_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + int cpu = freq->cpu; + struct cpufreq_extents *extents; +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + struct cpumask* mask; + int id; +#endif + + if (freq->flags & CPUFREQ_CONST_LOOPS) + return NOTIFY_OK; + + if (val != CPUFREQ_POSTCHANGE) + return NOTIFY_OK; + + /* if dynamic load scale is disabled, set the load scale to 1.0 */ + if (!hmp_data.freqinvar_load_scale_enabled) { + freq_scale[cpu].curr_scale = 1024; + return NOTIFY_OK; + } + + extents = &freq_scale[cpu]; +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + if (extents->max < extents->const_max){ + extents->throttling=1; + } + else { + extents->throttling=0; + } +#endif + if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) { + /* If our governor was recognised as a single-freq governor, + * use 1.0 + */ + extents->curr_scale = 1024; + } else { +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->const_max, freq->new); +#else + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->max, freq->new); +#endif + } + +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + mask = arch_cpu_is_big(cpu)?&hmp_fast_cpu_mask:&hmp_slow_cpu_mask; + for_each_cpu(id, mask) + freq_scale[id].curr_scale = extents->curr_scale; +#endif + +#if NR_CPUS == 4 +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + switch (cpu) { + case 0: + case 2: + (extents + 1)->curr_scale = extents->curr_scale; + break; + + case 1: + case 3: + (extents - 1)->curr_scale = extents->curr_scale; + break; + + default: + + break; + } +#endif +#endif + +#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER + per_cpu(FREQ_CPU, cpu) = freq->new; +#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */ + return NOTIFY_OK; +} + +/* Called when the CPUFreq governor is changed. + * Only called for the CPUs which are actually changed by the + * userspace. + */ +static int cpufreq_policy_callback(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct cpufreq_policy *policy = data; + struct cpufreq_extents *extents; + int cpu, singleFreq = 0; + static const char performance_governor[] = "performance"; + static const char powersave_governor[] = "powersave"; + + if (event == CPUFREQ_START) + return 0; + + if (event != CPUFREQ_INCOMPATIBLE) + return 0; + + /* CPUFreq governors do not accurately report the range of + * CPU Frequencies they will choose from. + * We recognise performance and powersave governors as + * single-frequency only. + */ + if (!strncmp(policy->governor->name, performance_governor, + strlen(performance_governor)) || + !strncmp(policy->governor->name, powersave_governor, + strlen(powersave_governor))) + singleFreq = 1; + + /* Make sure that all CPUs impacted by this policy are + * updated since we will only get a notification when the + * user explicitly changes the policy on a CPU. + */ + for_each_cpu(cpu, policy->cpus) { + extents = &freq_scale[cpu]; + extents->max = policy->max >> SCHED_FREQSCALE_SHIFT; + extents->min = policy->min >> SCHED_FREQSCALE_SHIFT; +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + extents->const_max = policy->cpuinfo.max_freq >> SCHED_FREQSCALE_SHIFT; +#endif + if (!hmp_data.freqinvar_load_scale_enabled) { + extents->curr_scale = 1024; + } else if (singleFreq) { + extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ; + extents->curr_scale = 1024; + } else { + extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ; +#ifdef CONFIG_SCHED_HMP_ENHANCEMENT + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->const_max, policy->cur); +#else + extents->curr_scale = cpufreq_calc_scale(extents->min, + extents->max, policy->cur); +#endif + } + } + + return 0; +} + +static struct notifier_block cpufreq_notifier = { + .notifier_call = cpufreq_callback, +}; +static struct notifier_block cpufreq_policy_notifier = { + .notifier_call = cpufreq_policy_callback, +}; + +static int __init register_sched_cpufreq_notifier(void) +{ + int ret = 0; + + /* init safe defaults since there are no policies at registration */ + for (ret = 0; ret < CONFIG_NR_CPUS; ret++) { + /* safe defaults */ + freq_scale[ret].max = 1024; + freq_scale[ret].min = 1024; + freq_scale[ret].curr_scale = 1024; + } + + pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n"); + ret = cpufreq_register_notifier(&cpufreq_policy_notifier, + CPUFREQ_POLICY_NOTIFIER); + + if (ret != -EINVAL) + ret = cpufreq_register_notifier(&cpufreq_notifier, + CPUFREQ_TRANSITION_NOTIFIER); + + return ret; +} + +core_initcall(register_sched_cpufreq_notifier); +#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */ + +#ifdef CONFIG_HEVTASK_INTERFACE +/* + * * This allows printing both to /proc/task_detect and + * * to the console + * */ +#ifndef CONFIG_KGDB_KDB +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) +#else +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else if (__get_cpu_var(kdb_in_use) == 1) \ + kdb_printf(x); \ + else \ + printk(x); \ + } while (0) +#endif + +static int task_detect_show(struct seq_file *m, void *v) +{ + struct task_struct *p; + unsigned long flags; + unsigned int i; + +#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE + for(i=0;i<NR_CPUS;i++){ + SEQ_printf(m,"%5d ",freq_scale[i].curr_scale); + } +#endif + + SEQ_printf(m, "\n%lu\n ",jiffies_to_cputime(jiffies)); + + for(i=0;i<NR_CPUS;i++){ + raw_spin_lock_irqsave(&cpu_rq(i)->lock,flags); + if(cpu_online(i)){ + list_for_each_entry(p,&cpu_rq(i)->cfs_tasks,se.group_node){ + SEQ_printf(m, "%lu %5d %5d %lu (%15s)\n ", + p->se.avg.load_avg_ratio,p->pid,task_cpu(p), + (p->utime+p->stime),p->comm); + } + } + raw_spin_unlock_irqrestore(&cpu_rq(i)->lock,flags); + + } + + return 0; +} + +static int task_detect_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, task_detect_show, NULL); +} + +static const struct file_operations task_detect_fops = { + .open = task_detect_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_task_detect_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = proc_create("task_detect", 0444, NULL, &task_detect_fops); + if (!pe) + return -ENOMEM; + return 0; +} + +__initcall(init_task_detect_procfs); +#endif /* CONFIG_HEVTASK_INTERFACE */ diff --git a/kernel/sched/features.h b/kernel/sched/features.h new file mode 100644 index 000000000..99399f8e4 --- /dev/null +++ b/kernel/sched/features.h @@ -0,0 +1,72 @@ +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ +SCHED_FEAT(START_DEBIT, true) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, false) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, true) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ +SCHED_FEAT(CACHE_HOT_BUDDY, true) + +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + +/* + * Use arch dependent cpu power functions + */ +SCHED_FEAT(ARCH_POWER, true) + +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(DOUBLE_TICK, false) +SCHED_FEAT(LB_BIAS, true) + +/* + * Decrement CPU power based on time not spent running tasks + */ +SCHED_FEAT(NONTASK_POWER, true) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, true) + +SCHED_FEAT(FORCE_SD_OVERLAP, false) +SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. Can be controlled via + * numa_balancing=. Allow PTE scanning to be forced on UMA machines + * for debugging the core machinery. + */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA, false) +SCHED_FEAT(NUMA_FORCE, false) +#endif diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c new file mode 100644 index 000000000..d8da01008 --- /dev/null +++ b/kernel/sched/idle_task.c @@ -0,0 +1,115 @@ +#include "sched.h" + +/* + * idle-task scheduling class. + * + * (NOTE: these are not related to SCHED_IDLE tasks which are + * handled in sched/fair.c) + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +{ + return task_cpu(p); /* IDLE tasks as never migrated */ +} + +static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) +{ + idle_exit_fair(rq); + rq_last_tick_reset(rq); +} + +static void post_schedule_idle(struct rq *rq) +{ + idle_enter_fair(rq); +} +#endif /* CONFIG_SMP */ +/* + * Idle tasks are unconditionally rescheduled: + */ +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +{ + resched_task(rq->idle); +} + +static struct task_struct *pick_next_task_idle(struct rq *rq) +{ + schedstat_inc(rq, sched_goidle); +#ifdef CONFIG_SMP + /* Trigger the post schedule to do an idle_enter for CFS */ + rq->post_schedule = 1; +#endif + return rq->idle; +} + +/* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +static void +dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) +{ + raw_spin_unlock_irq(&rq->lock); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + raw_spin_lock_irq(&rq->lock); +} + +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_idle(struct rq *rq) +{ +} + +static void switched_to_idle(struct rq *rq, struct task_struct *p) +{ + BUG(); +} + +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +{ + BUG(); +} + +static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +/* + * Simple, special scheduling class for the per-CPU idle tasks: + */ +const struct sched_class idle_sched_class = { + /* .next is NULL */ + /* no enqueue/yield_task for idle tasks */ + + /* dequeue is not valid, we print a debug message there: */ + .dequeue_task = dequeue_task_idle, + + .check_preempt_curr = check_preempt_curr_idle, + + .pick_next_task = pick_next_task_idle, + .put_prev_task = put_prev_task_idle, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, + .pre_schedule = pre_schedule_idle, + .post_schedule = post_schedule_idle, +#endif + + .set_curr_task = set_curr_task_idle, + .task_tick = task_tick_idle, + + .get_rr_interval = get_rr_interval_idle, + + .prio_changed = prio_changed_idle, + .switched_to = switched_to_idle, +}; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c new file mode 100644 index 000000000..5c1622cda --- /dev/null +++ b/kernel/sched/rt.c @@ -0,0 +1,2993 @@ +/* + * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR + * policies) + */ + +#include "sched.h" +#if defined(CONFIG_HMP_TRACER) || defined(CONFIG_MT_SCHED_TRACE) +#include <trace/events/sched.h> +#endif + +#include <linux/slab.h> + +int sched_rr_timeslice = RR_TIMESLICE; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +struct rt_bandwidth def_rt_bandwidth; + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + raw_spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + raw_spin_lock(&rt_b->rt_runtime_lock); + start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); + raw_spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_PROVE_LOCKING +DEFINE_RAW_SPINLOCK(rt_rq_runtime_spinlock); +#define MAX_SPIN_KEY 10 +DEFINE_PER_CPU(struct lock_class_key, spin_key[MAX_SPIN_KEY]); +DEFINE_PER_CPU(int, spin_key_idx); +#endif +void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; +#ifdef CONFIG_PROVE_LOCKING + int cpu, idx; +#endif + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.next = MAX_RT_PRIO; + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + /* MTK patch: prevent to continue borrow RT runtime after restore the default value*/ + rt_rq->rt_disable_borrow = 0; +#ifdef CONFIG_PROVE_LOCKING + raw_spin_lock(&rt_rq_runtime_spinlock); + cpu = rq->cpu; + idx = per_cpu(spin_key_idx, cpu); +#endif + raw_spin_lock_init(&rt_rq->rt_runtime_lock); +#ifdef CONFIG_PROVE_LOCKING + lockdep_set_class(&rt_rq->rt_runtime_lock, &per_cpu(spin_key[idx], cpu)); + per_cpu(spin_key_idx, cpu)++; + BUG_ON(per_cpu(spin_key_idx, cpu) >= MAX_SPIN_KEY); + raw_spin_unlock(&rt_rq_runtime_spinlock); +#endif +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} + +#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) + +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!rt_entity_is_task(rt_se)); +#endif + return container_of(rt_se, struct task_struct, rt); +} + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ + return rt_rq->rq; +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + return rt_se->rt_rq; +} + +void free_rt_sched_group(struct task_group *tg) +{ + int i; + + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); + + for_each_possible_cpu(i) { + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + +void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; + rt_rq->tg = tg; + + tg->rt_rq[cpu] = rt_rq; + tg->rt_se[cpu] = rt_se; + + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + + rt_se->my_q = rt_rq; + rt_se->parent = parent; + INIT_LIST_HEAD(&rt_se->run_list); +} + +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); + + for_each_possible_cpu(i) { + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_rq) + goto err; + + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_se) + goto err_free_rq; + + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); + } + + return 1; + +err_free_rq: + kfree(rt_rq); +err: + return 0; +} + +#else /* CONFIG_RT_GROUP_SCHED */ + +#define rt_entity_is_task(rt_se) (1) + +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ + return container_of(rt_se, struct task_struct, rt); +} + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ + return container_of(rt_rq, struct rq, rt); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct task_struct *p = rt_task_of(rt_se); + struct rq *rq = task_rq(p); + + return &rq->rt; +} + +void free_rt_sched_group(struct task_group *tg) { } + +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +#if defined(CONFIG_MT_RT_SCHED) +extern struct cpumask hmp_fast_cpu_mask; +extern struct cpumask hmp_slow_cpu_mask; +#endif + +#ifdef CONFIG_SMP + +static inline int rt_overloaded(struct rq *rq) +{ + return atomic_read(&rq->rd->rto_count); +} + +static inline void rt_set_overload(struct rq *rq) +{ + if (!rq->online) + return; + + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); + /* + * Make sure the mask is visible before we set + * the overload count. That is checked to determine + * if we should look at the mask. It would be a shame + * if we looked at the mask, but the mask was not + * updated yet. + */ + wmb(); + atomic_inc(&rq->rd->rto_count); +} + +static inline void rt_clear_overload(struct rq *rq) +{ + if (!rq->online) + return; + + /* the order here really doesn't matter */ + atomic_dec(&rq->rd->rto_count); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); +} + +#ifdef CONFIG_MT_RT_SCHED +static inline int has_rt_task_in_little(void) +{ + int cpu; + struct rq *rq; + + for_each_cpu(cpu, &hmp_slow_cpu_mask){ + if (!cpu_online(cpu)) + continue; + + rq = cpu_rq(cpu); + if(rq->rt.rt_nr_running >= 1) + return 1; + } + + return 0; +} +#endif + +static void update_rt_migration(struct rt_rq *rt_rq) +{ + if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { + if (!rt_rq->overloaded) { + rt_set_overload(rq_of_rt_rq(rt_rq)); + rt_rq->overloaded = 1; + } + } else if (rt_rq->overloaded) { + rt_clear_overload(rq_of_rt_rq(rt_rq)); + rt_rq->overloaded = 0; + } +} + +static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + return; + + p = rt_task_of(rt_se); + rt_rq = &rq_of_rt_rq(rt_rq)->rt; + + rt_rq->rt_nr_total++; + if (p->nr_cpus_allowed > 1) + rt_rq->rt_nr_migratory++; + + update_rt_migration(rt_rq); +} + +static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + return; + + p = rt_task_of(rt_se); + rt_rq = &rq_of_rt_rq(rt_rq)->rt; + + rt_rq->rt_nr_total--; + if (p->nr_cpus_allowed > 1) + rt_rq->rt_nr_migratory--; + + update_rt_migration(rt_rq); +} + +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + plist_node_init(&p->pushable_tasks, p->prio); + plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); + + /* Update the highest prio pushable task */ + if (p->prio < rq->rt.highest_prio.next) + rq->rt.highest_prio.next = p->prio; +} + +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + + /* Update the new highest prio pushable task */ + if (has_pushable_tasks(rq)) { + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + rq->rt.highest_prio.next = p->prio; + } else + rq->rt.highest_prio.next = MAX_RT_PRIO; +} + +#else + +static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} + +static inline +void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} + +#endif /* CONFIG_SMP */ + +static inline int on_rt_rq(struct sched_rt_entity *rt_se) +{ + return !list_empty(&rt_se->run_list); +} + +#ifdef CONFIG_RT_GROUP_SCHED + +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +{ + if (!rt_rq->tg) + return RUNTIME_INF; + + return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); +} + +typedef struct task_group *rt_rq_iter_t; + +static inline struct task_group *next_task_group(struct task_group *tg) +{ + do { + tg = list_entry_rcu(tg->list.next, + typeof(struct task_group), list); + } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); + + if (&tg->list == &task_groups) + tg = NULL; + + return tg; +} + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for (iter = container_of(&task_groups, typeof(*iter), list); \ + (iter = next_task_group(iter)) && \ + (rt_rq = iter->rt_rq[cpu_of(rq)]);) + +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_add_rcu(&rt_rq->leaf_rt_rq_list, + &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_del_rcu(&rt_rq->leaf_rt_rq_list); +} + +#define for_each_leaf_rt_rq(rt_rq, rq) \ + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) + +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = rt_se->parent) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return rt_se->my_q; +} + +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se); + +static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct sched_rt_entity *rt_se; + + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + + rt_se = rt_rq->tg->rt_se[cpu]; + + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se, false); + if (rt_rq->highest_prio.curr < curr->prio) + resched_task(curr); + } +} + +static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ + struct sched_rt_entity *rt_se; + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + + rt_se = rt_rq->tg->rt_se[cpu]; + + if (rt_se && on_rt_rq(rt_se)) + dequeue_rt_entity(rt_se); +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} + +static int rt_se_boosted(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = group_rt_rq(rt_se); + struct task_struct *p; + + if (rt_rq) + return !!rt_rq->rt_nr_boosted; + + p = rt_task_of(rt_se); + return p->prio != p->normal_prio; +} + +#ifdef CONFIG_SMP +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} +#else +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_online_mask; +} +#endif + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &rt_rq->tg->rt_bandwidth; +} + +void unthrottle_offline_rt_rqs(struct rq *rq) { + struct rt_rq *rt_rq; + + for_each_leaf_rt_rq(rt_rq, rq) { + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + if (rt_rq_throttled(rt_rq)){ + rt_rq->rt_throttled = 0; + printk_deferred("sched: RT throttling inactivated\n"); + } + } +} + +#else /* !CONFIG_RT_GROUP_SCHED */ + +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +{ + return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(def_rt_bandwidth.rt_period); +} + +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +#define for_each_leaf_rt_rq(rt_rq, rq) \ + for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = NULL) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return NULL; +} + +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ + if (rt_rq->rt_nr_running) + resched_task(rq_of_rt_rq(rt_rq)->curr); +} + +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} + +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_online_mask; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &def_rt_bandwidth; +} + +void unthrottle_offline_rt_rqs(struct rq *rq) { } + +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + */ +//#define MTK_DEBUG_CGROUP +static int do_balance_runtime(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; + int i, weight, more = 0; + u64 rt_period; + + weight = cpumask_weight(rd->span); + + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); + + if (rt_rq->rt_disable_borrow ==1){ + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); + return 0; + } + rt_period = ktime_to_ns(rt_b->rt_period); + +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG " do_balance_runtime curr_cpu=%d, dst_cpu=%d, span=%lu\n", + smp_processor_id(), rt_rq->rq->cpu, rd->span->bits[0]); +#endif + for_each_cpu(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + /* MTK Patch: use try lock to prevent deadlock */ + // raw_spin_lock(&iter->rt_runtime_lock); +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG " do_balance_runtime get lock cpu=%d\n", i); +#endif + if(!raw_spin_trylock(&iter->rt_runtime_lock)){ +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG " do_balance_runtime try lock fail cpu=%d\n", i); +#endif + continue; + } + /* + * Either all rqs have inf runtime and there's nothing to steal + * or __disable_runtime() below sets a specific rq to inf to + * indicate its been disabled and disalow stealing. + */ + if (iter->rt_disable_borrow ==1) + goto next; + if (iter->rt_runtime == RUNTIME_INF) + goto next; + + /* + * From runqueues with spare time, take 1/n part of their + * spare time, but no more than our period. + */ + diff = iter->rt_runtime - iter->rt_time; + +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "borrow, dst_cpu=%d, src_cpu=%d, src_cpu2=%d, src_addr=%x, dst_addr=%x,dst->rt_runtime=%llu, src->rt_runtime=%llu, diff=%lld, span=%lu\n", + rt_rq->rq->cpu, i, iter->rq->cpu, iter, + rt_rq, rt_rq->rt_runtime, iter->rt_runtime, diff, rd->span->bits[0]); +#endif + if (diff > 0) { + diff = div_u64((u64)diff, weight); + if (rt_rq->rt_runtime + diff > rt_period) + diff = rt_period - rt_rq->rt_runtime; + iter->rt_runtime -= diff; + rt_rq->rt_runtime += diff; + more = 1; +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "borrow successfully, dst_cpu=%d, src_cpu=%d, src_cpu2=%d, src_addr=%x, dst_addr=%x,dst->rt_runtime=%llu, src->rt_runtime=%llu, diff=%lld, span=%lu\n", + rt_rq->rq->cpu, i, iter->rq->cpu, iter, + rt_rq, rt_rq->rt_runtime, iter->rt_runtime, diff, rd->span->bits[0]); +#endif + if (rt_rq->rt_runtime == rt_period) { + raw_spin_unlock(&iter->rt_runtime_lock); + break; + } + } +next: + raw_spin_unlock(&iter->rt_runtime_lock); + } + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); + + return more; +} + +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. + */ +static void __disable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + rt_rq_iter_t iter; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_rt_rq(rt_rq, iter, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + s64 want; + int i; + + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); + /* MTK Patch: prevent race condition */ + rt_rq->rt_disable_borrow = 1; + /* + * Either we're all inf and nobody needs to borrow, or we're + * already disabled and thus have nothing to do, or we have + * exactly the right amount of runtime to take out. + */ +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "0. disable_runtime, cpu=%d, rd->span=%lu, rt_rq_addr=%x, rt_rq->rt_runtime=%llu, rt_b->rt_runtime=%llu\n", + rt_rq->rq->cpu, rd->span->bits[0], + rt_rq, rt_rq->rt_runtime, rt_b->rt_runtime); +#endif + if (rt_rq->rt_runtime == RUNTIME_INF || + rt_rq->rt_runtime == rt_b->rt_runtime) + goto balanced; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ + want = rt_b->rt_runtime - rt_rq->rt_runtime; + + /* + * Greedy reclaim, take back as much as we can. + */ + for_each_cpu(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "0. disable_runtime, cpu=%d,rt_b->rt_runtime=%llu, rt_rq->rt_runtime=%llu, want=%lld, rd->span=%lu\n", + rt_rq->rq->cpu, rt_b->rt_runtime, rt_rq->rt_runtime, want, rd->span->bits[0]); +#endif + + /* + * Can't reclaim from ourselves or disabled runqueues. + */ + if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF || iter->rt_disable_borrow){ +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "1. disable_runtime, cpu=%d, %llu\n", + i, iter->rt_runtime); +#endif + continue; + } + + raw_spin_lock(&iter->rt_runtime_lock); +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "2-1. disable_runtime cpu=%d, want=%lld, iter->rt_runtime=%llu\n", + i, want, iter->rt_runtime); +#endif + if (want > 0) { + diff = min_t(s64, iter->rt_runtime, want); + iter->rt_runtime -= diff; + want -= diff; +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "2. disable_runtime, rt_runtime=%llu, diff=%lld, want=%lld\n", + iter->rt_runtime, diff, want); +#endif + } else { + iter->rt_runtime -= want; + want -= want; +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "3. disable_runtime, rt_runtime=%llu, want=%lld\n", iter->rt_runtime, want); +#endif + } + raw_spin_unlock(&iter->rt_runtime_lock); + + if (!want) + break; + } + + raw_spin_lock(&rt_rq->rt_runtime_lock); + /* + * We cannot be left wanting - that would mean some runtime + * leaked out of the system. + */ + if(want){ +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "4. disable_runtime, want=%lld, rt_rq->rt_runtime=%llu\n", + want, rt_rq->rt_runtime); + { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, 0); + printk(KERN_EMERG "4-0. disable_runtime %llu\n", iter->rt_runtime); + iter = sched_rt_period_rt_rq(rt_b, 1); + printk(KERN_EMERG "4-1. disable_runtime %llu\n", iter->rt_runtime); + iter = sched_rt_period_rt_rq(rt_b, 2); + printk(KERN_EMERG "4-2. disable_runtime %llu\n", iter->rt_runtime); + iter = sched_rt_period_rt_rq(rt_b, 3); + printk(KERN_EMERG "4-3. disable_runtime %llu\n", iter->rt_runtime); + } +#endif + + BUG_ON(want); + } +balanced: + /* + * Disable all the borrow logic by pretending we have inf + * runtime - in which case borrowing doesn't make sense. + */ + // MTK patch: prevent normal task could run anymore, use rt_disable_borrow + //rt_rq->rt_runtime = RUNTIME_INF; + rt_rq->rt_runtime = rt_b->rt_runtime; + rt_rq->rt_throttled = 0; +#ifdef MTK_DEBUG_CGROUP + { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, 0); + printk(KERN_EMERG "5-0. disable_runtime %llu\n", iter->rt_runtime); + iter = sched_rt_period_rt_rq(rt_b, 1); + printk(KERN_EMERG "5-1. disable_runtime %llu\n", iter->rt_runtime); + iter = sched_rt_period_rt_rq(rt_b, 2); + printk(KERN_EMERG "5-2. disable_runtime %llu\n", iter->rt_runtime); + iter = sched_rt_period_rt_rq(rt_b, 3); + printk(KERN_EMERG "5-3. disable_runtime %llu\n", iter->rt_runtime); + } +#endif + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); +#ifdef MTK_DEBUG_CGROUP + printk(KERN_ERR "disable_runtime after: rt_rq->rt_runtime=%llu rq_rt->rt_throttled=%d\n", + rt_rq->rt_runtime, rt_rq->rt_throttled); +#endif + } + + mt_sched_printf(sched_rt_info, "cpu=%d rt_throttled=%d", rq->cpu, rq->rt.rt_throttled); +} + +static void disable_runtime(struct rq *rq) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + __disable_runtime(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +static void __enable_runtime(struct rq *rq) +{ + rt_rq_iter_t iter; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + /* + * Reset each runqueue's bandwidth settings + */ + for_each_rt_rq(rt_rq, iter, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_disable_borrow ){ + #ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG "enable_runtime %d \n", rq->cpu); + #endif + rt_rq->rt_runtime = rt_b->rt_runtime; + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_disable_borrow = 0; + } + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); + } + + mt_sched_printf(sched_rt_info, "cpu=%d rt_throttled=%d", rq->cpu, rq->rt.rt_throttled); +} + +static void enable_runtime(struct rq *rq) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + __enable_runtime(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + +static int balance_runtime(struct rt_rq *rt_rq) +{ + int more = 0; + + if (!sched_feat(RT_RUNTIME_SHARE)) + return more; + + if (rt_rq->rt_time > rt_rq->rt_runtime) { + raw_spin_unlock(&rt_rq->rt_runtime_lock); + more = do_balance_runtime(rt_rq); + raw_spin_lock(&rt_rq->rt_runtime_lock); + } + + return more; +} +#else /* !CONFIG_SMP */ +static inline int balance_runtime(struct rt_rq *rt_rq) +{ + return 0; +} +#endif /* CONFIG_SMP */ + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1, throttled = 0; + const struct cpumask *span; + + span = sched_rt_period_mask(); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest + * the timer run on a CPU which does not service all runqueues, + * potentially leaving other CPUs indefinitely throttled. If + * isolation is really required, the user will turn the throttle + * off to kill the perturbations it causes anyway. Meanwhile, + * this maintains functionality for boot and/or troubleshooting. + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +#endif + +#ifdef MTK_DEBUG_CGROUP + printk(KERN_EMERG " do_sched_rt_period_timer curr_cpu=%d \n", smp_processor_id()); +#endif + for_each_cpu(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + raw_spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime; + u64 runtime_pre, rt_time_pre; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_throttled) { + runtime_pre = rt_rq->rt_runtime; + balance_runtime(rt_rq); + rt_time_pre = rt_rq->rt_time; + } + runtime = rt_rq->rt_runtime; + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled) { + printk_deferred("sched: cpu=%d, [%llu -> %llu]" + " -= min(%llu, %d*[%llu -> %llu])" + "\n", i, rt_time_pre, + rt_rq->rt_time, rt_time_pre, + overrun, runtime_pre, runtime); + } + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + printk_deferred("sched: RT throttling inactivated" + " cpu=%d\n", i); + rt_rq->rt_throttled = 0; + mt_sched_printf(sched_rt_info, "cpu=%d rt_throttled=%d", + rq_cpu(rq), rq->rt.rt_throttled); + + enqueue = 1; + + /* + * Force a clock update if the CPU was idle, + * lest wakeup -> unthrottle time accumulate. + */ + if (rt_rq->rt_nr_running && rq->curr == rq->idle) + rq->skip_clock_update = -1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } else if (rt_rq->rt_nr_running) { + idle = 0; + if (!rt_rq_throttled(rt_rq)) + enqueue = 1; + } + if (rt_rq->rt_throttled) + throttled = 1; + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + raw_spin_unlock(&rq->lock); + } + + if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) + return 1; + + return idle; +} + +static inline int rt_se_prio(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq) + return rt_rq->highest_prio.curr; +#endif + + return rt_task_of(rt_se)->prio; +} +DEFINE_PER_CPU(u64, exec_delta_time); +DEFINE_PER_CPU(u64, clock_task); +DEFINE_PER_CPU(u64, exec_start); +DEFINE_PER_CPU(struct task_struct, exec_task); +static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) +{ + u64 runtime = sched_rt_runtime(rt_rq); + u64 runtime_pre; + + if (rt_rq->rt_throttled) + return rt_rq_throttled(rt_rq); + + if (runtime >= sched_rt_period(rt_rq)) + return 0; + + runtime_pre = runtime; + balance_runtime(rt_rq); + runtime = sched_rt_runtime(rt_rq); + if (runtime == RUNTIME_INF) + return 0; + + if (rt_rq->rt_time > runtime) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + int cpu = rq_cpu(rt_rq->rq); + + printk_deferred("sched: cpu=%d rt_time %llu <-> runtime" + " [%llu -> %llu], exec_task[%d:%s], prio=%d, exec_delta_time[%llu]" + ", clock_task[%llu], exec_start[%llu]\n", + cpu, rt_rq->rt_time, runtime_pre, runtime, + per_cpu(exec_task, cpu).pid, + per_cpu(exec_task, cpu).comm, + per_cpu(exec_task, cpu).prio, + per_cpu(exec_delta_time, cpu), + per_cpu(clock_task, cpu), + per_cpu(exec_start, cpu)); + /* + * Don't actually throttle groups that have no runtime assigned + * but accrue some time due to boosting. + */ + /* MTK patch: print rt throttle everytime*/ + if (likely(rt_b->rt_runtime)) { + // static bool once = false; + + rt_rq->rt_throttled = 1; + + // if (!once) { + // once = true; + printk_deferred("sched: RT throttling activated cpu=%d\n", + cpu); + // } + mt_sched_printf(sched_rt_info, "cpu=%d rt_throttled=%d", + cpu, rt_rq->rt_throttled); + + } else { + /* + * In case we did anyway, make it go away, + * replenishment is a joke, since it will replenish us + * with exactly 0 ns. + */ + rt_rq->rt_time = 0; + } + + if (rt_rq_throttled(rt_rq)) { + sched_rt_rq_dequeue(rt_rq); + return 1; + } + } + + return 0; +} + +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. + */ +static void update_curr_rt(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_rt_entity *rt_se = &curr->rt; + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + u64 delta_exec; + int cpu = rq_cpu(rq); + + if (curr->sched_class != &rt_sched_class) + return; + + delta_exec = rq->clock_task - curr->se.exec_start; + if (unlikely((s64)delta_exec <= 0)) + return; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + per_cpu(exec_task, cpu).pid = curr->pid; + per_cpu(exec_task, cpu).prio = curr->prio; + strcpy(per_cpu(exec_task, cpu).comm, curr->comm); + per_cpu(exec_delta_time, cpu) = delta_exec; + per_cpu(clock_task, cpu) = rq->clock_task; + per_cpu(exec_start, cpu) = curr->se.exec_start; + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq->clock_task; + cpuacct_charge(curr, delta_exec); + + sched_rt_avg_update(rq, delta_exec); + + if (!rt_bandwidth_enabled()) + return; + + for_each_sched_rt_entity(rt_se) { + rt_rq = rt_rq_of_se(rt_se); + + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + } +} + +#if defined CONFIG_SMP + +static void +inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (&rq->rt != rt_rq) + return; +#endif + if (rq->online && prio < prev_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, prio); +} + +static void +dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (&rq->rt != rt_rq) + return; +#endif + if (rq->online && rt_rq->highest_prio.curr != prev_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); +} + +#else /* CONFIG_SMP */ + +static inline +void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} +static inline +void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} + +#endif /* CONFIG_SMP */ + +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +static void +inc_rt_prio(struct rt_rq *rt_rq, int prio) +{ + int prev_prio = rt_rq->highest_prio.curr; + + if (prio < prev_prio) + rt_rq->highest_prio.curr = prio; + + inc_rt_prio_smp(rt_rq, prio, prev_prio); +} + +static void +dec_rt_prio(struct rt_rq *rt_rq, int prio) +{ + int prev_prio = rt_rq->highest_prio.curr; + + if (rt_rq->rt_nr_running) { + + WARN_ON(prio < prev_prio); + + /* + * This may have been our highest task, and therefore + * we may have some recomputation to do + */ + if (prio == prev_prio) { + struct rt_prio_array *array = &rt_rq->active; + + rt_rq->highest_prio.curr = + sched_find_first_bit(array->bitmap); + } + + } else + rt_rq->highest_prio.curr = MAX_RT_PRIO; + + dec_rt_prio_smp(rt_rq, prio, prev_prio); +} + +#else + +static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} +static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} + +#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED + +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted++; + + if (rt_rq->tg) + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +} + +static void +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted--; + + WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); +} + +#else /* CONFIG_RT_GROUP_SCHED */ + +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + start_rt_bandwidth(&def_rt_bandwidth); +} + +static inline +void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} + +#endif /* CONFIG_RT_GROUP_SCHED */ + +static inline +void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + int prio = rt_se_prio(rt_se); + + WARN_ON(!rt_prio(prio)); + rt_rq->rt_nr_running++; + + inc_rt_prio(rt_rq, prio); + inc_rt_migration(rt_se, rt_rq); + inc_rt_group(rt_se, rt_rq); +} + +static inline +void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + WARN_ON(!rt_prio(rt_se_prio(rt_se))); + WARN_ON(!rt_rq->rt_nr_running); + rt_rq->rt_nr_running--; + + dec_rt_prio(rt_rq, rt_se_prio(rt_se)); + dec_rt_migration(rt_se, rt_rq); + dec_rt_group(rt_se, rt_rq); +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +{ + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + struct rt_prio_array *array = &rt_rq->active; + struct rt_rq *group_rq = group_rt_rq(rt_se); + struct list_head *queue = array->queue + rt_se_prio(rt_se); + + /* + * Don't enqueue the group if its throttled, or when empty. + * The latter is a consequence of the former when a child group + * get throttled and the current group doesn't have any other + * active members. + */ +// if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && ( !group_rq->rt_nr_running)) + return; + + if (!rt_rq->rt_nr_running) + list_add_leaf_rt_rq(rt_rq); + + if (head) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + __set_bit(rt_se_prio(rt_se), array->bitmap); + + inc_rt_tasks(rt_se, rt_rq); +} + +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + struct rt_prio_array *array = &rt_rq->active; + + list_del_init(&rt_se->run_list); + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + dec_rt_tasks(rt_se, rt_rq); + if (!rt_rq->rt_nr_running) + list_del_leaf_rt_rq(rt_rq); +} + +/* + * Because the prio of an upper entry depends on the lower + * entries, we must remove entries top - down. + */ +static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +{ + struct sched_rt_entity *back = NULL; + + for_each_sched_rt_entity(rt_se) { + rt_se->back = back; + back = rt_se; + } + + for (rt_se = back; rt_se; rt_se = rt_se->back) { + if (on_rt_rq(rt_se)) + __dequeue_rt_entity(rt_se); + } +} + +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +{ + dequeue_rt_stack(rt_se); + for_each_sched_rt_entity(rt_se) + __enqueue_rt_entity(rt_se, head); +} + +static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ + dequeue_rt_stack(rt_se); + + for_each_sched_rt_entity(rt_se) { + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq && rt_rq->rt_nr_running) + __enqueue_rt_entity(rt_se, false); + } +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void +enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_rt_entity *rt_se = &p->rt; + + if (flags & ENQUEUE_WAKEUP) + rt_se->timeout = 0; + + enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); + + inc_nr_running(rq); +} + +static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_rt_entity *rt_se = &p->rt; + + update_curr_rt(rq); + dequeue_rt_entity(rt_se); + + dequeue_pushable_task(rq, p); + + dec_nr_running(rq); +} + +/* + * Put task to the head or the end of the run list without the overhead of + * dequeue followed by enqueue. + */ +static void +requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) +{ + if (on_rt_rq(rt_se)) { + struct rt_prio_array *array = &rt_rq->active; + struct list_head *queue = array->queue + rt_se_prio(rt_se); + + if (head) + list_move(&rt_se->run_list, queue); + else + list_move_tail(&rt_se->run_list, queue); + } +} + +static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) +{ + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq; + + for_each_sched_rt_entity(rt_se) { + rt_rq = rt_rq_of_se(rt_se); + requeue_rt_entity(rt_rq, rt_se, head); + } +} + +static void yield_task_rt(struct rq *rq) +{ + requeue_task_rt(rq, rq->curr, 0); +} + +#ifdef CONFIG_SMP +static int find_lowest_rq(struct task_struct *task); + +static int +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +{ + struct task_struct *curr; + struct rq *rq; + int cpu; + + cpu = task_cpu(p); + + if (p->nr_cpus_allowed == 1) + goto out; + + /* For anything but wake ups, just return the task_cpu */ + if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + goto out; + + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + + /* + * If the current task on @p's runqueue is an RT task, then + * try to see if we can wake this RT task up on another + * runqueue. Otherwise simply start this RT task + * on its current runqueue. + * + * We want to avoid overloading runqueues. If the woken + * task is a higher priority, then it will stay on this CPU + * and the lower prio task should be moved to another CPU. + * Even though this will probably make the lower prio task + * lose its cache, we do not want to bounce a higher task + * around just because it gave up its CPU, perhaps for a + * lock? + * + * For equal prio tasks, we just let the scheduler sort it out. + * + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + * + * This test is optimistic, if we get it wrong the load-balancer + * will have to sort it out. + */ + if(curr){ + mt_sched_printf(sched_rt_info, "1 select_task_rq_rt cpu=%d p=%d:%s:prio=%d:0x%x curr=%d:%s:prio=%d:0x%x", + cpu, p->pid, p->comm, p->prio, p->nr_cpus_allowed, + curr->pid, curr->comm, curr->prio, curr->nr_cpus_allowed); + }else{ + mt_sched_printf(sched_rt_info, "1 select_task_rq_rt cpu=%d p=%d:%s:prio=%d:0x%x", + cpu, p->pid, p->comm, p->prio, p->nr_cpus_allowed); + } + +#if defined(CONFIG_MT_RT_SCHED)||defined(CONFIG_MT_SCHED_INTEROP) + /* if the task is allowed to put more than one CPU. */ + if ( (p->nr_cpus_allowed > 1) ){ +#else + if (curr && + unlikely(rt_task(curr)) && + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) + && (p->nr_cpus_allowed > 1)) { +#endif + int target = find_lowest_rq(p); + + if (target != -1) + cpu = target; + + mt_sched_printf(sched_rt_info, "2. select_task_rq_rt %d:%s to cpu=%d", p->pid, p->comm, cpu); + } + rcu_read_unlock(); + +out: + return cpu; +} + +static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) +{ + if (rq->curr->nr_cpus_allowed == 1) + return; + + if (p->nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, NULL)) + return; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + return; + + /* + * There appears to be other cpus that can accept + * current and none to run 'p', so lets reschedule + * to try and push current away: + */ + requeue_task_rt(rq, p, 1); + resched_task(rq->curr); +} + +#endif /* CONFIG_SMP */ + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) +{ + mt_sched_printf(sched_rt_info, "check_preempt_curr_rt %d:%d:%s", p->prio, rq->curr->prio, p->comm); + if (p->prio < rq->curr->prio) { + resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * If: + * + * - the newly woken task is of equal priority to the current task + * - the newly woken task is non-migratable while current is migratable + * - current will be preempted on the next reschedule + * + * we should check to see if current can readily move to a different + * cpu. If so, we will reschedule to allow the push logic to try + * to move current somewhere else, making room for our non-migratable + * task. + */ + if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) + check_preempt_equal_prio(rq, p); +#endif +} + +#ifdef CONFIG_MT_RT_SCHED +/* Return the second highest RT task, NULL otherwise */ +static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, + struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *next = NULL; + struct sched_rt_entity *rt_se; + int idx; + + idx = sched_find_first_bit(array->bitmap); + BUG_ON(idx >= MAX_RT_PRIO); + +next_idx: + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)){ + next = rt_se; + break; + } + + p = rt_task_of(rt_se); + if ( (!cpu_online(rq->cpu)) || (!test_tsk_need_released(p))) { + next = rt_se; + break; + }else{ + mt_sched_printf(sched_rt_info, "1. pick_next_rt_entity bypass %d %s", p->pid, p->comm); + } + } + if (!next) { + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + if (idx < MAX_RT_PRIO) + goto next_idx; + } + + return next; +} + +#else +static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, + struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *next = NULL; + struct list_head *queue; + int idx; + + idx = sched_find_first_bit(array->bitmap); + BUG_ON(idx >= MAX_RT_PRIO); + + queue = array->queue + idx; + next = list_entry(queue->next, struct sched_rt_entity, run_list); + + return next; +} +#endif + +static struct task_struct *_pick_next_task_rt(struct rq *rq) +{ + struct sched_rt_entity *rt_se; + struct task_struct *p; + struct rt_rq *rt_rq; + + rt_rq = &rq->rt; + + if (!rt_rq->rt_nr_running) + return NULL; + + if (rt_rq_throttled(rt_rq)){ + /* prevent wdt from RT throttle */ + struct rt_prio_array *array = &rt_rq->active; + int idx = 0, prio = MAX_RT_PRIO- 1 - idx; //WDT priority + + if( test_bit(idx, array->bitmap)){ + list_for_each_entry(rt_se, array->queue + idx, run_list){ + p = rt_task_of(rt_se); + if( (p->rt_priority == prio) && (0 == strncmp(p->comm, "wdtk", 4)) ){ + p->se.exec_start = rq->clock_task; + printk_deferred("sched: unthrottle %s\n", p->comm); + return p; + } + } + } + return NULL; + } + + do { + rt_se = pick_next_rt_entity(rq, rt_rq); +#ifdef CONFIG_MT_RT_SCHED + if(!rt_se){ + mt_sched_printf(sched_rt_info, "_pick_next_task_rt %d:%s:%d:%d:%d", + rq->curr->pid, rq->curr->comm, rq->curr->prio, + test_tsk_need_released(rq->curr), rt_rq->rt_nr_running); + return NULL; + } +#endif + BUG_ON(!rt_se); + rt_rq = group_rt_rq(rt_se); + } while (rt_rq); + + p = rt_task_of(rt_se); + p->se.exec_start = rq->clock_task; + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = _pick_next_task_rt(rq); + + /* The running task is never eligible for pushing */ + if (p) + dequeue_pushable_task(rq, p); + +#ifdef CONFIG_SMP + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +#endif + + return p; +} + +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +{ + update_curr_rt(rq); + + /* + * The previous task needs to be made eligible for pushing + * if it is still active + */ +#ifdef CONFIG_MT_RT_SCHED + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1 && !test_tsk_need_released(p)) + enqueue_pushable_task(rq, p); +#else + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); +#endif +} + +#ifdef CONFIG_SMP + +/* Only try algorithms three times */ +#define RT_MAX_TRIES 3 + +static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_running(rq, p) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) + return 1; + return 0; +} + +/* Return the second highest RT task, NULL otherwise */ +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) +{ + struct task_struct *next = NULL; + struct sched_rt_entity *rt_se; + struct rt_prio_array *array; + struct rt_rq *rt_rq; + int idx; + + for_each_leaf_rt_rq(rt_rq, rq) { + array = &rt_rq->active; + idx = sched_find_first_bit(array->bitmap); +next_idx: + if (idx >= MAX_RT_PRIO) + continue; + if (next && next->prio <= idx) + continue; + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); + if (pick_rt_task(rq, p, cpu)) { + next = p; + break; + } + } + if (!next) { + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + goto next_idx; + } + } + + return next; +} + +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); + +#ifdef CONFIG_MT_RT_SCHED +static int test_has_highest_prio(int this_cpu) +{ + int cpu, highest_prio; + struct rq *this_rq = cpu_rq(this_cpu), *rq; + int prio = this_rq->curr->prio; + + mt_sched_printf(sched_rt_info, "0. test_has_highest_prio %d:%d:%s:%d %lu", + this_cpu, this_rq->curr->pid, this_rq->curr->comm, prio, + tsk_cpus_allowed(this_rq->curr)->bits[0]); + if (prio >= MAX_RT_PRIO){ + mt_sched_printf(sched_rt_info, "test_has_highest_prio false %d:%d:%s:%d", + this_cpu, this_rq->curr->pid, this_rq->curr->comm, prio); + return 0; + } + + for_each_cpu(cpu, &hmp_fast_cpu_mask) { + if(!cpu_online(cpu)) + continue; + + if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(this_rq->curr))) + continue; + + rq = cpu_rq(cpu); + + if(rq->rt.rt_nr_running == 0){ + mt_sched_printf(sched_rt, "test_has_highest_prio true %d", cpu); + return 1; + } + + highest_prio = rq->rt.highest_prio.curr; + + mt_sched_printf(sched_rt_info, "1. test_has_highest_prio %d:%d %d", + cpu, highest_prio, prio); + /* if currenet task's priority is higher than process in big CPU */ + if(prio < highest_prio){ + mt_sched_printf(sched_rt, "test_has_highest_prio true %d:%d:%d", + cpu, highest_prio, prio); + return 1; + } + } + + mt_sched_printf(sched_rt, "test_has_highest_prio false %d:%d:%s:%d", + this_cpu, this_rq->curr->pid, this_rq->curr->comm, prio); + + return 0; +} + +static void release_task_ipi(void *data) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + mt_sched_printf(sched_rt, "1. release_task_ipi %d %lu %d", + cpu, hmp_slow_cpu_mask.bits[0], (int)(long)data); + + /* check if current process is LITTLE */ + if (!cpumask_test_cpu(cpu, &hmp_slow_cpu_mask)) + return; + + /* check if current task is highest_n_tasks? */ + if ( !test_has_highest_prio(cpu)){ + mt_sched_printf(sched_rt_info, "3. release_task_ipi false"); + return; + } + + mt_sched_printf(sched_rt_info, "set_tsk_need_release %d:%s:%d", rq->curr->pid, rq->curr->comm, rq->curr->prio); + set_tsk_need_released(rq->curr); + set_tsk_need_resched(rq->curr); +} + +static DEFINE_PER_CPU(int, mt_need_released); +static int find_highest_prio_in_LITTLE(struct rq *this_rq, int pull) +{ + int cpu, prio, this_cpu = this_rq->cpu, highest_prio; + struct rq *rq = NULL; + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + + highest_prio = MAX_RT_PRIO; + cpumask_clear(lowest_mask); + + mt_sched_printf(sched_rt_info, "0. find_highest_prio_in_LITTLE %lu %d:%d %d", + hmp_slow_cpu_mask.bits[0], + this_rq->cpu, this_rq->rt.highest_prio.curr, + pull); + for_each_cpu(cpu, &hmp_slow_cpu_mask){ + mt_sched_printf(sched_rt_info, "1. find_highest_prio_in_LITTLE %d %d", cpu, cpu_online(cpu)); + if (!cpu_online(cpu)) + continue; + + rq = cpu_rq(cpu); + if(rq->rt.rt_nr_running == 0) + continue; + + prio = rq->rt.highest_prio.curr; + + mt_sched_printf(sched_rt_info, "2. find_highest_prio_in_LITTLE %d %d %d %lu", + cpu, prio, highest_prio, tsk_cpus_allowed(rq->curr)->bits[0]); + + /* If the highest priority of LITTLE CPU is smaller and equal than current, + * then bypass + */ + if (prio >= this_rq->rt.highest_prio.curr ) + continue; + + /* If the prority of LITTLE CPU is smaller and than highest_prio of LITTLE CPUs */ + if (prio > highest_prio) + continue; + + /* check the affinity */ + if (!cpumask_test_cpu(this_rq->cpu, tsk_cpus_allowed(rq->curr))) + continue; + + if (prio < highest_prio){ + if ( 0 == pull ){ + mt_sched_printf(sched_rt_info, "3. find_highest_prio_in_LITTLE find"); + return 1; + } + + highest_prio = prio; + cpumask_clear(lowest_mask); + } + + cpumask_set_cpu(cpu, lowest_mask); + + mt_sched_printf(sched_rt_info, "2. find_highest_prio_in_LITTLE %d:%d %d %lu", + cpu, prio, highest_prio, lowest_mask->bits[0]); + } + + if (cpumask_empty(lowest_mask)){ + mt_sched_printf(sched_rt_info, "3. find_highest_prio_in_LITTLE not find"); + return 0; + } + + raw_spin_unlock_irq(&this_rq->lock); + per_cpu(mt_need_released, this_cpu) = 1; + for_each_cpu (cpu, lowest_mask) { + rq = cpu_rq(cpu); + mt_sched_printf(sched_rt_info, "4. find_highest_prio_in_LITTLE %d %d", + cpu, rq->rt.highest_prio.curr); + if (highest_prio == rq->rt.highest_prio.curr) { + /* send IPI release */ + mt_sched_printf(sched_rt, "send ipi release to cpu=%d prio=%d", + cpu, rq->rt.highest_prio.curr); + /* the target CPU will execute release_task_ipi */ + smp_call_function_single(cpu, release_task_ipi, (void *)this_cpu, 0); + break; + } + } + + raw_spin_lock_irq(&this_rq->lock); + return 1; +} + +static int find_lowest_rq_in_big(struct task_struct *task, struct cpumask *lowest_mask) +{ + int i, lowest_prio = 0; + struct rq *rq = NULL; + + cpumask_clear(lowest_mask); + mt_sched_printf(sched_rt_info, "0. find_lowest_rq_in_big %lu %d:%s:%d", + (unsigned long)hmp_fast_cpu_mask.bits[0], + task->pid, task->comm, task->prio); + + for_each_cpu(i, &hmp_fast_cpu_mask){ + int prio; + + if (!cpu_online(i)) + continue; + + rq = cpu_rq(i); + prio = rq->rt.highest_prio.curr; + + mt_sched_printf(sched_rt_info, "1. find_lowest_rq_in_big %d:%d %d:%lu", + i, prio, + lowest_prio, (unsigned long)lowest_mask->bits[0]); + + /* If the highest priority of CPU is higher than lowest_prio + * or higher than the task, then bypass + */ + if ((prio < lowest_prio) || (prio <= task->prio)) + continue; + + if (!cpumask_test_cpu(i, tsk_cpus_allowed(task))) + continue; + + /* If the priority lower than lowest_prio */ + if (prio > lowest_prio){ + lowest_prio = prio; + cpumask_clear(lowest_mask); + } + + cpumask_set_cpu(i, lowest_mask); + } + + if (cpumask_empty(lowest_mask)){ + mt_sched_printf(sched_rt_info, "2. find_lowest_rq_in_big not find"); + return 0; + } + + mt_sched_printf(sched_rt_info, "3. find_lowest_rq_in_big find %d:%s:%d %d:%lu", + task->pid, task->comm, task->prio, + lowest_prio, (unsigned long)lowest_mask->bits[0]); + return 1; +} + +static int find_lowest_rq_in_LITTLE(struct task_struct *task, struct cpumask *lowest_mask) +{ + int i, lowest_prio = 0; + struct rq *rq = NULL; + + cpumask_clear(lowest_mask); + mt_sched_printf(sched_rt_info, "0. find_lowest_rq_in_LITTLE %lu %d:%s:%d", + (unsigned long)hmp_slow_cpu_mask.bits[0], + task->pid, task->comm, task->prio); + + for_each_cpu(i, &hmp_slow_cpu_mask){ + int prio; + + if (!cpu_online(i)) + continue; + + rq = cpu_rq(i); + prio = rq->rt.highest_prio.curr; + + mt_sched_printf(sched_rt_info, "1. find_lowest_rq_in_LITTLE %d:%d %d:%lu", + i, prio, + lowest_prio, (unsigned long)lowest_mask->bits[0]); + + /* If the highest priority of CPU is higher than lowest_prio + * or higher than the task, then bypass + */ + if ((prio < lowest_prio) || (prio <= task->prio)) + continue; + + if (!cpumask_test_cpu(i, tsk_cpus_allowed(task))) + continue; + + /* If the priority lower than lowest_prio */ + if (prio > lowest_prio){ + lowest_prio = prio; + cpumask_clear(lowest_mask); + } + + cpumask_set_cpu(i, lowest_mask); + } + + if (cpumask_empty(lowest_mask)){ + mt_sched_printf(sched_rt_info, "2. find_lowest_rq_in_LITTLE not find"); + return 0; + } + + mt_sched_printf(sched_rt_info, "3. find_lowest_rq_in_LITTLE find %d:%s:%d %d:%lu", + task->pid, task->comm, task->prio, + lowest_prio, (unsigned long)lowest_mask->bits[0]); + + return 1; +} +#endif + +#ifdef CONFIG_MT_SCHED_INTEROP +static int mt_sched_interop_rt(int cpu, struct cpumask *lowest_mask) +{ + int lowest_cpu = -1, lowest_prio = 0; + + mt_sched_printf(sched_interop, "current cpu=%d, find idle cpu from cpumask 0x%lx", + cpu, lowest_mask->bits[0]); + + if (cpumask_test_cpu(cpu, lowest_mask)&& idle_cpu(cpu)) + return cpu; + + for_each_cpu(cpu, lowest_mask) { + struct rq *rq; + struct task_struct *curr; + if (idle_cpu(cpu)) { + return cpu; + } + + rq = cpu_rq(cpu); + curr = rq->curr; + if ((curr->sched_class == &fair_sched_class) && (curr->prio > lowest_prio)) { + lowest_prio = curr->prio; + lowest_cpu = cpu; + + mt_sched_printf(sched_interop, "lowest_cpu=%d, lowest_prio=%d", + lowest_cpu, lowest_prio); + } + } + + if (-1 != lowest_cpu){ + return lowest_cpu; + } + + return -1; +} +#endif + +static int find_lowest_rq(struct task_struct *task) +{ + struct sched_domain *sd; + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + int this_cpu = smp_processor_id(); + int cpu = task_cpu(task); +#ifdef CONFIG_MT_SCHED_INTEROP + int interop_cpu; +#endif + + mt_sched_printf(sched_rt_info, "1 find_lowest_rq lowest_mask=0x%lx, task->cpus_allowed=0x%lx", + lowest_mask->bits[0], task->cpus_allowed.bits[0]); + /* Make sure the mask is initialized first */ + if (unlikely(!lowest_mask)) + return -1; + + if (task->nr_cpus_allowed == 1) + return -1; /* No other targets possible */ + +#ifdef CONFIG_MT_RT_SCHED + if (!find_lowest_rq_in_big(task, lowest_mask)){ + if (!find_lowest_rq_in_LITTLE(task, lowest_mask)){ + return -1; /* No targets found */ + } + } +#else + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + return -1; /* No targets found */ +#endif + +#ifdef CONFIG_MT_SCHED_INTEROP + interop_cpu = mt_sched_interop_rt(cpu, lowest_mask); + if (interop_cpu != -1 ){ + mt_sched_printf(sched_interop, "find idle cpu=%d", interop_cpu); + return interop_cpu; + } +#endif + + /* + * At this point we have built a mask of cpus representing the + * lowest priority tasks in the system. Now we want to elect + * the best one based on our affinity and topology. + * + * We prioritize the last cpu that the task executed on since + * it is most likely cache-hot in that location. + */ + if (cpumask_test_cpu(cpu, lowest_mask)) + return cpu; + + /* + * Otherwise, we consult the sched_domains span maps to figure + * out which cpu is logically closest to our hot cache data. + */ + if (!cpumask_test_cpu(this_cpu, lowest_mask)) + this_cpu = -1; /* Skip this_cpu opt if not among lowest */ + + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; + + /* + * "this_cpu" is cheaper to preempt than a + * remote processor. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return this_cpu; + } + + best_cpu = cpumask_first_and(lowest_mask, + sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) { + rcu_read_unlock(); + return best_cpu; + } + } + } + rcu_read_unlock(); + + /* + * And finally, if there were no matches within the domains + * just give the caller *something* to work with from the compatible + * locations. + */ + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + return -1; +} + +/* Will lock the rq it finds */ +static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *lowest_rq = NULL; + int tries; + int cpu; + + for (tries = 0; tries < RT_MAX_TRIES; tries++) { + cpu = find_lowest_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + break; + + lowest_rq = cpu_rq(cpu); + + /* if the prio of this runqueue changed, try again */ + if (double_lock_balance(rq, lowest_rq)) { + /* + * We had to unlock the run queue. In + * the mean time, task could have + * migrated already or had its affinity changed. + * Also make sure that it wasn't scheduled on its rq. + */ + + mt_sched_printf(sched_rt_info, "1. find_lock_lowest_rq %d %d %d %s", + lowest_rq->cpu, rq->cpu, task->pid, task->comm); + if (unlikely(task_rq(task) != rq || + !cpumask_test_cpu(lowest_rq->cpu, + tsk_cpus_allowed(task)) || + task_running(rq, task) || + !task->on_rq)) { + + double_unlock_balance(rq, lowest_rq); + lowest_rq = NULL; + break; + } + } + + /* If this rq is still suitable use it. */ + if (lowest_rq->rt.highest_prio.curr > task->prio) + break; + + /* try again */ + double_unlock_balance(rq, lowest_rq); + lowest_rq = NULL; + } + + return lowest_rq; +} + +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!p->on_rq); + BUG_ON(!rt_task(p)); + + return p; +} + +#ifdef CONFIG_MT_RT_SCHED +/* Will lock the rq it finds */ +/* refer find_lock_lowest_rq() */ +static struct rq *find_lock_lowest_rq_mtk(struct task_struct *task, struct rq *rq) +{ + struct rq *lowest_rq = NULL; + int cpu; + + cpu = find_lowest_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + return NULL; + + lowest_rq = cpu_rq(cpu); + + /* if the prio of this runqueue changed, try again */ + if (double_lock_balance(rq, lowest_rq)) { + /* + * We had to unlock the run queue. In + * the mean time, task could have + * migrated already or had its affinity changed. + * Also make sure that it wasn't scheduled on its rq. + */ + mt_sched_printf(sched_rt_info, "1. find_lock_lowest_rq_mtk %d %d %d %s", + lowest_rq->cpu, rq->cpu, task->pid, task->comm); + if (unlikely(task_rq(task) != rq || + !cpumask_test_cpu(lowest_rq->cpu, + tsk_cpus_allowed(task)) || + task_running(rq, task) || + !task->on_rq)) { + double_unlock_balance(rq, lowest_rq); + return NULL; + } + } + + /* If this rq is still suitable use it. */ + if (lowest_rq->rt.highest_prio.curr > task->prio){ + return lowest_rq; + } + + double_unlock_balance(rq, lowest_rq); + return NULL; +} +#endif + + +#ifdef CONFIG_MT_RT_SCHED +/* refer push_rt_task() */ +int push_need_released_rt_task(struct rq *rq, struct task_struct *p) +{ + struct rq *lowest_rq; + int ret = 0; + + if (!p) + return 0; + + mt_sched_printf(sched_rt_info, "0. push_need_released_task %d:%s %d:%s", + p->pid, p->comm, rq->curr->pid, rq->curr->comm); + + if (unlikely(p == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* We might release rq lock */ + get_task_struct(p); + + /* find_lock_lowest_rq locks the rq if found */ + lowest_rq = find_lock_lowest_rq_mtk(p, rq); + if (!lowest_rq) { + mt_sched_printf(sched_rt, "1. push_need_released_task fail %d:%s:%d %d", + p->pid, p->comm, p->prio, rq->curr->prio); + put_task_struct(p); + + if (likely(p->prio < rq->curr->prio)) { + resched_task(rq->curr); + }else{ + mt_sched_printf(sched_rt, "1. push_need_released_task fail %d:%s:%d %d", + p->pid, p->comm, p->prio, rq->curr->prio); +#if 0 + printk(KERN_ALERT "[sched] push_need_released_task fail %d:%s:%d %d\n", + p->pid, p->comm, p->prio, rq->curr->prio); + WARN_ON(1); +#endif + } + return 0; + } + + mt_sched_printf(sched_rt, "push_need_released_task task=%d:%s cpu=%d", + p->pid, p->comm, lowest_rq->cpu); + + deactivate_task(rq, p, 0); + set_task_cpu(p, lowest_rq->cpu); + activate_task(lowest_rq, p, 0); + ret = 1; + + resched_task(lowest_rq->curr); + + double_unlock_balance(rq, lowest_rq); + + put_task_struct(p); + + return ret; +} +#endif + +/* + * If the current CPU has more than one RT task, see if the non + * running task can migrate over to a CPU that is running a task + * of lesser priority. + */ +static int push_rt_task(struct rq *rq) +{ + struct task_struct *next_task; + struct rq *lowest_rq; + int ret = 0; + + if (!rq->rt.overloaded) + return 0; + + next_task = pick_next_pushable_task(rq); + if (!next_task) + return 0; + +retry: + if (unlikely(next_task == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* + * It's possible that the next_task slipped in of + * higher priority than current. If that's the case + * just reschedule current. + */ + if (unlikely(next_task->prio < rq->curr->prio)) { + resched_task(rq->curr); + return 0; + } + + /* We might release rq lock */ + get_task_struct(next_task); + + /* find_lock_lowest_rq locks the rq if found */ + lowest_rq = find_lock_lowest_rq(next_task, rq); + if (!lowest_rq) { + struct task_struct *task; + /* + * find_lock_lowest_rq releases rq->lock + * so it is possible that next_task has migrated. + * + * We need to make sure that the task is still on the same + * run-queue and is also still the next task eligible for + * pushing. + */ + task = pick_next_pushable_task(rq); + if (task_cpu(next_task) == rq->cpu && task == next_task) { + /* + * The task hasn't migrated, and is still the next + * eligible task, but we failed to find a run-queue + * to push it to. Do not retry in this case, since + * other cpus will pull from us when ready. + */ + goto out; + } + + if (!task) + /* No more tasks, just exit */ + goto out; + + /* + * Something has shifted, try again. + */ + put_task_struct(next_task); + next_task = task; + goto retry; + } + + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, lowest_rq->cpu); + activate_task(lowest_rq, next_task, 0); + ret = 1; + + resched_task(lowest_rq->curr); + + double_unlock_balance(rq, lowest_rq); + +out: + put_task_struct(next_task); + + return ret; +} + +static void push_rt_tasks(struct rq *rq) +{ + /* push_rt_task will return true if it moved an RT */ + while (push_rt_task(rq)) + ; +} + +#ifdef CONFIG_MT_RT_SCHED +/* refer pull_rt_task() */ +static int pick_next_highest_task(struct rq *this_rq){ + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p; + struct rq *src_rq; + + for_each_cpu(cpu, this_rq->rd->rto_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + + /* + * Don't bother taking the src_rq->lock if the next highest + * task is known to be lower-priority than our current task. + * This may look racy, but if this value is about to go + * logically higher, the src_rq will push this task away. + * And if its going logically lower, we do not care + */ + if (src_rq->rt.highest_prio.next >= + this_rq->rt.highest_prio.curr) + continue; + + /* + * We can potentially drop this_rq's lock in + * double_lock_balance, and another CPU could + * alter this_rq + */ + double_lock_balance(this_rq, src_rq); + + /* + * Are there still pullable RT tasks? + */ + if (src_rq->rt.rt_nr_running <= 1) + goto skip; + + p = pick_next_highest_task_rt(src_rq, this_cpu); + + /* + * Do we have an RT task that preempts + * the to-be-scheduled task? + */ + if (p && (p->prio < this_rq->rt.highest_prio.curr)) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->on_rq); + + /* + * There's a chance that p is higher in priority + * than what's currently running on its cpu. + * This is just that p is wakeing up and hasn't + * had a chance to schedule. We only pull + * p if it is lower in priority than the + * current task on the run queue + */ + if (p->prio < src_rq->curr->prio) + goto skip; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + /* + * We continue with the search, just in + * case there's an even higher prio task + * in another runqueue. (low likelihood + * but possible) + */ + mt_sched_printf(sched_rt_info, "pick_next_highest_task %d:%d %d %d:%s:%d\n", + this_rq->cpu, this_rq->rt.highest_prio.curr, + src_rq->cpu, + p->pid, p->comm, p->prio); + } +skip: + double_unlock_balance(this_rq, src_rq); + } + + return ret; +} + +void mt_check_rt_policy(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu; + if ( cpumask_test_cpu(this_cpu, &hmp_fast_cpu_mask) ){ + if ( !per_cpu(mt_need_released, this_cpu) ) + return; + mt_sched_printf(sched_rt_info, "0. mt_check_rt_policy %d %d %s", + this_cpu, this_rq->curr->pid, this_rq->curr->comm ); + + if ( find_highest_prio_in_LITTLE(this_rq, 0) ){ + set_tsk_need_resched(this_rq->curr); + mt_sched_printf(sched_rt_info, "1. mt_check_rt_policy %d %d %s", + this_cpu, this_rq->curr->pid, this_rq->curr->comm ); + }else{ + per_cpu(mt_need_released, this_cpu)=0; + mt_sched_printf(sched_rt_info, "2. mt_check_rt_policy %d", this_cpu); + } + } +} + +int mt_post_schedule(struct rq *rq) +{ + int this_cpu = rq->cpu, ret = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + if ( cpumask_test_cpu(this_cpu, &hmp_fast_cpu_mask) ) { + if ( has_rt_task_in_little() ) + ret = find_highest_prio_in_LITTLE(rq, 1); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return ret; +} +#endif + +#ifdef CONFIG_MT_RT_SCHED +int pull_rt_task(struct rq *this_rq) +#else +static int pull_rt_task(struct rq *this_rq) +#endif +{ +#if !defined(CONFIG_MT_RT_SCHED) + int this_cpu = this_rq->cpu; +#endif + int ret = 0; +#ifndef CONFIG_MT_RT_SCHED + int cpu; + struct task_struct *p; + struct rq *src_rq; +#endif + + mt_sched_printf(sched_rt_info, "0. pull_rt_task %d %d ", + rt_overloaded(this_rq), this_rq->cpu); + +#ifdef CONFIG_MT_RT_SCHED + if (likely(!rt_overloaded(this_rq))) + return 0; + ret = pick_next_highest_task(this_rq); +#else + if (likely(!rt_overloaded(this_rq))) + return 0; + + mt_sched_printf(sched_rt_info, "1. pull_rt_task %lu ", + this_rq->rd->rto_mask->bits[0]); + for_each_cpu(cpu, this_rq->rd->rto_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + + /* + * Don't bother taking the src_rq->lock if the next highest + * task is known to be lower-priority than our current task. + * This may look racy, but if this value is about to go + * logically higher, the src_rq will push this task away. + * And if its going logically lower, we do not care + */ + mt_sched_printf(sched_rt_info, "2. pull_rt_task %d %d ", + src_rq->rt.highest_prio.next, this_rq->rt.highest_prio.curr); + if (src_rq->rt.highest_prio.next >= + this_rq->rt.highest_prio.curr) + continue; + + /* + * We can potentially drop this_rq's lock in + * double_lock_balance, and another CPU could + * alter this_rq + */ + double_lock_balance(this_rq, src_rq); + + /* + * Are there still pullable RT tasks? + */ + if (src_rq->rt.rt_nr_running <= 1) + goto skip; + + p = pick_next_highest_task_rt(src_rq, this_cpu); + + /* + * Do we have an RT task that preempts + * the to-be-scheduled task? + */ + if (p && (p->prio < this_rq->rt.highest_prio.curr)) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->on_rq); + + /* + * There's a chance that p is higher in priority + * than what's currently running on its cpu. + * This is just that p is wakeing up and hasn't + * had a chance to schedule. We only pull + * p if it is lower in priority than the + * current task on the run queue + */ + if (p->prio < src_rq->curr->prio) + goto skip; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + /* + * We continue with the search, just in + * case there's an even higher prio task + * in another runqueue. (low likelihood + * but possible) + */ + } +skip: + double_unlock_balance(this_rq, src_rq); + } +#endif + + return ret; +} + +static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + if (rq->rt.highest_prio.curr > prev->prio) + pull_rt_task(rq); +} + +static void post_schedule_rt(struct rq *rq) +{ + push_rt_tasks(rq); +} + +/* + * If we are not running and we are not going to reschedule soon, we should + * try to push tasks away now + */ +static void task_woken_rt(struct rq *rq, struct task_struct *p) +{ + if (!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + has_pushable_tasks(rq) && + p->nr_cpus_allowed > 1 && + rt_task(rq->curr) && + (rq->curr->nr_cpus_allowed < 2 || + rq->curr->prio <= p->prio)) + push_rt_tasks(rq); +} + +static void set_cpus_allowed_rt(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq; + int weight; + + BUG_ON(!rt_task(p)); + + if (!p->on_rq) + return; + + weight = cpumask_weight(new_mask); + + /* + * Only update if the process changes its state from whether it + * can migrate or not. + */ + if ((p->nr_cpus_allowed > 1) == (weight > 1)) + return; + + rq = task_rq(p); + + /* + * The process used to be able to migrate OR it can now migrate + */ + if (weight <= 1) { + if (!task_current(rq, p)) + dequeue_pushable_task(rq, p); + BUG_ON(!rq->rt.rt_nr_migratory); + rq->rt.rt_nr_migratory--; + } else { + if (!task_current(rq, p)) + enqueue_pushable_task(rq, p); + rq->rt.rt_nr_migratory++; + } + + update_rt_migration(&rq->rt); +} + +/* Assumes rq->lock is held */ +static void rq_online_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_set_overload(rq); + + __enable_runtime(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); +} + +/* Assumes rq->lock is held */ +static void rq_offline_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_clear_overload(rq); + + __disable_runtime(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); +} + +/* + * When switch from the rt queue, we bring ourselves to a position + * that we might want to pull RT tasks from other runqueues. + */ +static void switched_from_rt(struct rq *rq, struct task_struct *p) +{ + /* + * If there are other RT tasks then we will reschedule + * and the scheduling of the other RT tasks will handle + * the balancing. But if we are the last RT task + * we may need to handle the pulling of RT tasks + * now. + */ + if (!p->on_rq || rq->rt.rt_nr_running) + return; + + if (pull_rt_task(rq)) + resched_task(rq->curr); +} + +void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + GFP_KERNEL, cpu_to_node(i)); + } +} +#endif /* CONFIG_SMP */ + +/* + * When switching a task to RT, we may overload the runqueue + * with RT tasks. In this case we try to push them off to + * other runqueues. + */ +static void switched_to_rt(struct rq *rq, struct task_struct *p) +{ + int check_resched = 1; + + /* + * If we are already running, then there's nothing + * that needs to be done. But if we are not running + * we may need to preempt the current running task. + * If that current running task is also an RT task + * then see if we can move to another run queue. + */ + if (p->on_rq && rq->curr != p) { +#ifdef CONFIG_SMP + if (rq->rt.overloaded && push_rt_task(rq) && + /* Don't resched if we changed runqueues */ + rq != task_rq(p)) + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && p->prio < rq->curr->prio) + resched_task(rq->curr); + } +} + +/* + * Priority of the task has changed. This may cause + * us to initiate a push or pull. + */ +static void +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +{ + if (!p->on_rq) + return; + + if (rq->curr == p) { +#ifdef CONFIG_SMP + /* + * If our priority decreases while running, we + * may need to pull tasks to this runqueue. + */ + if (oldprio < p->prio) + pull_rt_task(rq); + /* + * If there's a higher priority task waiting to run + * then reschedule. Note, the above pull_rt_task + * can release the rq lock and p could migrate. + * Only reschedule if p is still on the same runqueue. + */ + if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) + resched_task(p); +#else + /* For UP simply resched on drop of prio */ + if (oldprio < p->prio) + resched_task(p); +#endif /* CONFIG_SMP */ + } else { + /* + * This task is not running, but if it is + * greater than the current running task + * then reschedule. + */ + if (p->prio < rq->curr->prio) + resched_task(rq->curr); + } +} + +static void watchdog(struct rq *rq, struct task_struct *p) +{ + unsigned long soft, hard; + + /* max may change after cur was read, this will be fixed next tick */ + soft = task_rlimit(p, RLIMIT_RTTIME); + hard = task_rlimit_max(p, RLIMIT_RTTIME); + + if (soft != RLIM_INFINITY) { + unsigned long next; + + if (p->rt.watchdog_stamp != jiffies) { + p->rt.timeout++; + p->rt.watchdog_stamp = jiffies; + } + + next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); + if (p->rt.timeout > next) + p->cputime_expires.sched_exp = p->se.sum_exec_runtime; + } +} + +static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) +{ + struct sched_rt_entity *rt_se = &p->rt; + + update_curr_rt(rq); + + watchdog(rq, p); + + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if (p->policy != SCHED_RR) + return; + + if (--p->rt.time_slice) + return; + + p->rt.time_slice = sched_rr_timeslice; + + /* + * Requeue to the end of queue if we (and all of our ancestors) are the + * only element on the queue + */ + for_each_sched_rt_entity(rt_se) { + if (rt_se->run_list.prev != rt_se->run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + return; + } + } +} + +static void set_curr_task_rt(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq->clock_task; + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); +} + +static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) +{ + /* + * Time slice is 0 for SCHED_FIFO tasks + */ + if (task->policy == SCHED_RR) + return sched_rr_timeslice; + else + return 0; +} + +const struct sched_class rt_sched_class = { + .next = &fair_sched_class, + .enqueue_task = enqueue_task_rt, + .dequeue_task = dequeue_task_rt, + .yield_task = yield_task_rt, + + .check_preempt_curr = check_preempt_curr_rt, + + .pick_next_task = pick_next_task_rt, + .put_prev_task = put_prev_task_rt, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_rt, + + .set_cpus_allowed = set_cpus_allowed_rt, + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, + .pre_schedule = pre_schedule_rt, + .post_schedule = post_schedule_rt, + .task_woken = task_woken_rt, + .switched_from = switched_from_rt, +#endif + + .set_curr_task = set_curr_task_rt, + .task_tick = task_tick_rt, + + .get_rr_interval = get_rr_interval_rt, + + .prio_changed = prio_changed_rt, + .switched_to = switched_to_rt, +}; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); + +void print_rt_stats(struct seq_file *m, int cpu) +{ + rt_rq_iter_t iter; + struct rt_rq *rt_rq; + + rcu_read_lock(); + for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) + print_rt_rq(m, cpu, rt_rq); + rcu_read_unlock(); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h new file mode 100644 index 000000000..db799f57e --- /dev/null +++ b/kernel/sched/sched.h @@ -0,0 +1,1427 @@ + +#include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/stop_machine.h> +#include <linux/tick.h> + +#include "cpupri.h" +#include "cpuacct.h" + +extern __read_mostly int scheduler_running; + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +extern unsigned long get_cpu_load(int cpu); +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. + */ +#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT + +/* + * These are the 'tuning knobs' of the scheduler: + */ + +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + +static inline int rt_policy(int policy) +{ + if (policy == SCHED_FIFO || policy == SCHED_RR) + return 1; + return 0; +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +extern struct mutex sched_domains_mutex; + +#ifdef CONFIG_CGROUP_SCHED + +#include <linux/cgroup.h> + +struct cfs_rq; +struct rt_rq; + +extern struct list_head task_groups; + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota, runtime; + s64 hierarchal_quota; + u64 runtime_expires; + + int idle, timer_active; + struct hrtimer period_timer, slack_timer; + struct list_head throttled_cfs_rq; + + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; +#endif +}; + +/* task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; + +#ifdef CONFIG_SMP + atomic_long_t load_avg; + atomic_t runnable_avg, usage_avg; +#endif +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + + struct cfs_bandwidth cfs_bandwidth; +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) +#endif + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + +extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); + +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent); + +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +#endif + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; + +#endif /* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned int nr_running, h_nr_running; + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr, *next, *last, *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_SMP + /* + * CFS Load tracking + * Under CFS, load is tracked on a per-entity basis and aggregated up. + * This allows for the description of both thread and group usage (in + * the FAIR_GROUP_SCHED case). + */ + unsigned long runnable_load_avg, blocked_load_avg; + atomic64_t decay_counter; + u64 last_decay; + atomic_long_t removed_load; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* Required to track per-cpu representation of a task_group */ + u32 tg_runnable_contrib, tg_usage_contrib; + unsigned long tg_load_contrib; +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + struct sched_avg avg; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_clock, throttled_clock_task; + u64 throttled_clock_task_time; + int throttled, throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +}; + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned int rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; +#endif + int rt_throttled; + int rt_disable_borrow; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; +#endif +}; + +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; +}; + +extern struct root_domain def_root_domain; + +#endif /* CONFIG_SMP */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned int nr_running; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; +#ifdef CONFIG_NO_HZ_COMMON + u64 nohz_stamp; + unsigned long nohz_flags; +#endif +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; +#endif + int skip_clock_update; + + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; + struct rt_rq rt; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; +#ifdef CONFIG_SMP + unsigned long h_load_throttle; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED + struct list_head leaf_rt_rq_list; +#endif + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr, *idle, *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; + + u64 clock; + u64 clock_task; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain *sd; + + unsigned long cpu_power; + + unsigned char idle_balance; + /* For active balancing */ + int post_schedule; + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; +#ifdef CONFIG_SCHED_HMP + struct task_struct *migrate_task; +#endif + /* cpu of this runqueue: */ + int cpu; + int online; + + struct list_head cfs_tasks; + + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif + struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + + struct sched_avg avg; +}; + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +DECLARE_PER_CPU(struct rq, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) + +#ifdef CONFIG_SMP + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) + +#define for_each_lower_domain(sd) for (; sd; sd = sd->child) + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The cpu whose highest level of sched domain is to + * be returned. + * @flag: The flag to check for the highest sched_domain + * for the given cpu. + * + * Returns the highest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd, *hsd = NULL; + + for_each_domain(cpu, sd) { + if (!(sd->flags & flag)) + break; + hsd = sd; + } + + return hsd; +} + +DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_id); + +struct sched_group_power { + atomic_t ref; + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. + */ + unsigned int power, power_orig; + unsigned long next_update; + /* + * Number of busy cpus in this group. + */ + atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + struct sched_group_power *sgp; + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +}; + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + +extern int group_balance_cpu(struct sched_group *sg); + +#ifdef CONFIG_SCHED_HMP +static LIST_HEAD(hmp_domains); +DECLARE_PER_CPU(struct hmp_domain *, hmp_cpu_domain); +#define hmp_cpu_domain(cpu) (per_cpu(hmp_cpu_domain, (cpu))) +#endif /* CONFIG_SCHED_HMP */ + +#endif /* CONFIG_SMP */ + +#include "stats.h" +#include "auto_group.h" + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We cannot use task_subsys_state() and friends because the cgroup + * subsystem changes that value before the cgroup_subsys::attach() method + * is called, therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + return p->sched_task_group; +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) + struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = tg->cfs_rq[cpu]; + p->se.parent = tg->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = tg->rt_rq[cpu]; + p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; + +#ifdef CONFIG_FAIR_GROUP_SCHED + BUG_ON(p->se.cfs_rq->rq->cpu != cpu); +#endif + +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# include <linux/static_key.h> +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +extern const_debug unsigned int sysctl_sched_features; + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "features.h" + __SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +static __always_inline bool static_branch__true(struct static_key *key) +{ + return static_key_true(key); /* Not out of line branch. */ +} + +static __always_inline bool static_branch__false(struct static_key *key) +{ + return static_key_false(key); /* Out of line branch. */ +} + +#define SCHED_FEAT(name, enabled) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ +{ \ + return static_branch__##enabled(key); \ +} + +#include "features.h" + +#undef SCHED_FEAT + +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) +#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ + +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#ifdef CONFIG_SCHED_DEBUG +#define numabalancing_enabled sched_feat_numa(NUMA) +#else +extern bool numabalancing_enabled; +#endif /* CONFIG_SCHED_DEBUG */ +#else +#define sched_feat_numa(x) (0) +#define numabalancing_enabled (0) +#endif /* CONFIG_NUMA_BALANCING */ + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + + + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + +#ifdef CONFIG_MT_RT_SCHED +extern void mt_check_rt_policy(struct rq *this_rq); +extern int push_need_released_rt_task(struct rq *rq, struct task_struct *p); +extern int pull_rt_task(struct rq *this_rq); +extern int mt_post_schedule(struct rq *rq); +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); +#ifdef CONFIG_MT_RT_SCHED + if(test_tsk_need_released(prev)){ + clear_tsk_need_released(prev); + push_need_released_rt_task(rq, prev); + } +#endif + raw_spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif + raw_spin_unlock(&rq->lock); +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif + local_irq_enable(); +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x4 /* internal use, task got migrated */ + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif + +#define DEQUEUE_SLEEP 1 + +struct sched_class { + const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + + struct task_struct * (*pick_next_task) (struct rq *rq); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + void (*post_schedule) (struct rq *this_rq); + void (*task_waking) (struct task_struct *task); + void (*task_woken) (struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); + void (*task_fork) (struct task_struct *p); + + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval) (struct rq *rq, + struct task_struct *task); + +#ifdef CONFIG_FAIR_GROUP_SCHED + void (*task_move_group) (struct task_struct *p, int on_rq); +#endif +}; + +#define sched_class_highest (&stop_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + + +#ifdef CONFIG_SMP + +extern void update_group_power(struct sched_domain *sd, int cpu); + +extern void trigger_load_balance(struct rq *rq, int cpu); +extern void idle_balance(int this_cpu, struct rq *this_rq); + +extern void idle_enter_fair(struct rq *this_rq); +extern void idle_exit_fair(struct rq *this_rq); + + +# ifdef CONFIG_MTK_SCHED_CMP_TGS +extern int group_leader_is_empty(struct task_struct *p); +# endif /* CONFIG_MTK_SCHED_CMP_TGS */ + +# ifdef CONFIG_MTK_SCHED_CMP +extern void get_cluster_cpus(struct cpumask *cpus, int cluster_id, + bool exclusive_offline); +extern int get_cluster_id(unsigned int cpu); +# endif /* CONFIG_MTK_SCHED_CMP */ + +#else /* CONFIG_SMP */ + +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + +#endif + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +#if defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK) || defined (CONFIG_HMP_PACK_SMALL_TASK) +extern void update_packing_domain(int cpu); +#endif /* CONFIG_HMP_PACK_SMALL_TASK */ +extern void update_max_interval(void); +extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); + +extern void resched_task(struct task_struct *p); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +extern void update_idle_cpu_load(struct rq *this_rq); + +extern void init_task_runnable_average(struct task_struct *p); + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ + if (unlikely(steal > NSEC_PER_SEC)) + return div_u64(steal, TICK_NSEC); + + return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + +static inline void inc_nr_running(struct rq *rq) +{ +#ifdef CONFIG_MTK_SCHED_RQAVG_KS + sched_update_nr_prod(cpu_of(rq), rq->nr_running, true); +#endif /* CONFIG_MTK_SCHED_RQAVG_KS */ + rq->nr_running++; + +#ifdef CONFIG_NO_HZ_FULL + if (rq->nr_running == 2) { + if (tick_nohz_full_cpu(rq->cpu)) { + /* Order rq->nr_running write against the IPI */ + smp_wmb(); + smp_send_reschedule(rq->cpu); + } + } +#endif +} + +static inline void dec_nr_running(struct rq *rq) +{ +#ifdef CONFIG_MTK_SCHED_RQAVG_KS + sched_update_nr_prod(cpu_of(rq), rq->nr_running, false); +#endif /* CONFIG_MTK_SCHED_RQAVG_KS */ + rq->nr_running--; +} + +static inline void rq_last_tick_reset(struct rq *rq) +{ +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = jiffies; +#endif +} + +extern void update_rq_clock(struct rq *rq); + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + +extern const_debug unsigned int sysctl_sched_time_avg; +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +static inline u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static inline int hrtick_enabled(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} +#else +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } +static inline void sched_avg_update(struct rq *rq) { } +#endif + +extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); + +#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT + +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!raw_spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); + ret = 1; + } else + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); + } + return ret; +} + +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + raw_spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + raw_spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + +#endif + +extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); +extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); +extern void print_cfs_stats(struct seq_file *m, int cpu); +extern void print_rt_stats(struct seq_file *m, int cpu); + +extern void init_cfs_rq(struct cfs_rq *cfs_rq); +extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); +extern void unthrottle_offline_rt_rqs(struct rq *rq); + +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void); + +#ifdef CONFIG_NO_HZ_COMMON +enum rq_nohz_flag_bits { + NOHZ_TICK_STOPPED, + NOHZ_BALANCE_KICK, +}; + +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +DECLARE_PER_CPU(u64, cpu_hardirq_time); +DECLARE_PER_CPU(u64, cpu_softirq_time); + +#ifndef CONFIG_64BIT +DECLARE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_SMP +static inline int rq_cpu(const struct rq *rq) { return rq->cpu; } +#else +static inline int rq_cpu(const struct rq *rq) { return 0; } +#endif + diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c new file mode 100644 index 000000000..da98af347 --- /dev/null +++ b/kernel/sched/stats.c @@ -0,0 +1,145 @@ + +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "sched.h" + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 15 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; + char *mask_str = kmalloc(mask_len, GFP_KERNEL); + + if (mask_str == NULL) + return -ENOMEM; + + if (v == (void *)1) { + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + } else { + struct rq *rq; +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcount = 0; +#endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %u 0 %u %u %u %u %llu %llu %lu", + cpu, rq->yld_count, + rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, + rq->rq_cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { + enum cpu_idle_type itype; + + cpumask_scnprintf(mask_str, mask_len, + sched_domain_span(sd)); + seq_printf(seq, "domain%d %s", dcount++, mask_str); + for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %u %u %u %u %u %u %u %u", + sd->lb_count[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, + " %u %u %u %u %u %u %u %u %u %u %u %u\n", + sd->alb_count, sd->alb_failed, sd->alb_pushed, + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); + } + rcu_read_unlock(); +#endif + } + kfree(mask_str); + return 0; +} + +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *schedstat_start(struct seq_file *file, loff_t *offset) +{ + unsigned long n = *offset; + + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return schedstat_start(file, offset); +} + +static void schedstat_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations schedstat_sops = { + .start = schedstat_start, + .next = schedstat_next, + .stop = schedstat_stop, + .show = show_schedstat, +}; + +static int schedstat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &schedstat_sops); +} + +static const struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_schedstat_init(void) +{ + proc_create("schedstat", 0, NULL, &proc_schedstat_operations); + return 0; +} +module_init(proc_schedstat_init); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h new file mode 100644 index 000000000..2ef90a51e --- /dev/null +++ b/kernel/sched/stats.h @@ -0,0 +1,231 @@ + +#ifdef CONFIG_SCHEDSTATS + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta; + rq->rq_sched_info.pcount++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_cpu_time += delta; +} + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +# define schedstat_set(var, val) do { var = (val); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long long delta) +{} +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) +# define schedstat_set(var, val) do { } while (0) +#endif + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * We are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a cpu, we call this routine + * from dequeue_task() to account for possible rq->clock skew across cpus. The + * delta taken on each cpu would annul the skew. + */ +static inline void sched_info_dequeued(struct task_struct *t) +{ + unsigned long long now = task_rq(t)->clock, delta = 0; + + if (unlikely(sched_info_on())) + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + + rq_sched_info_dequeued(task_rq(t), delta); +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(struct task_struct *t) +{ + unsigned long long now = task_rq(t)->clock, delta = 0; + + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + t->sched_info.last_arrival = now; + t->sched_info.pcount++; + + rq_sched_info_arrive(task_rq(t), delta); +} + +/* + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(struct task_struct *t) +{ + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = task_rq(t)->clock; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. + */ +static inline void sched_info_depart(struct task_struct *t) +{ + unsigned long long delta = task_rq(t)->clock - + t->sched_info.last_arrival; + + rq_sched_info_depart(task_rq(t), delta); + + if (t->state == TASK_RUNNING) + sched_info_queued(t); +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void +__sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + struct rq *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. + */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_reset_dequeued(t) do { } while (0) +#define sched_info_dequeued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * account_group_user_time - Maintain utime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; + + raw_spin_lock(&cputimer->lock); + cputimer->cputime.utime += cputime; + raw_spin_unlock(&cputimer->lock); +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; + + raw_spin_lock(&cputimer->lock); + cputimer->cputime.stime += cputime; + raw_spin_unlock(&cputimer->lock); +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk: Pointer to task structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; + + raw_spin_lock(&cputimer->lock); + cputimer->cputime.sum_exec_runtime += ns; + raw_spin_unlock(&cputimer->lock); +} diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c new file mode 100644 index 000000000..656a9e372 --- /dev/null +++ b/kernel/sched/stop_task.c @@ -0,0 +1,143 @@ +#include "sched.h" +#ifdef CONFIG_HMP_TRACER +#include <trace/events/sched.h> +#endif + +/* + * stop-task scheduling class. + * + * The stop task is the highest priority task in the system, it preempts + * everything and will be preempted by nothing. + * + * See kernel/stop_machine.c + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) +{ + return task_cpu(p); /* stop tasks as never migrate */ +} +#endif /* CONFIG_SMP */ + +static void +check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +{ + /* we're never preempted */ +} + +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + struct task_struct *stop = rq->stop; + + if (stop && stop->on_rq) { + stop->se.exec_start = rq->clock_task; + return stop; + } + + return NULL; +} + +static void +enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ + inc_nr_running(rq); +#ifdef CONFIG_HMP_TRACER + trace_sched_runqueue_length(rq->cpu,rq->nr_running); +#endif + #ifdef CONFIG_MET_SCHED_HMP + RqLen(smp_processor_id(),rq->nr_running); + #endif +} + +static void +dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ + dec_nr_running(rq); +#ifdef CONFIG_HMP_TRACER + trace_sched_runqueue_length(rq->cpu,rq->nr_running); +#endif + #ifdef CONFIG_MET_SCHED_HMP + RqLen(smp_processor_id(),rq->nr_running); + #endif +} + +static void yield_task_stop(struct rq *rq) +{ + BUG(); /* the stop task should never yield, its pointless. */ +} + +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +{ + struct task_struct *curr = rq->curr; + u64 delta_exec; + + delta_exec = rq->clock_task - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq->clock_task; + cpuacct_charge(curr, delta_exec); +} + +static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_stop(struct rq *rq) +{ + struct task_struct *stop = rq->stop; + + stop->se.exec_start = rq->clock_task; +} + +static void switched_to_stop(struct rq *rq, struct task_struct *p) +{ + BUG(); /* its impossible to change to this class */ +} + +static void +prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) +{ + BUG(); /* how!?, what priority? */ +} + +static unsigned int +get_rr_interval_stop(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +/* + * Simple, special scheduling class for the per-CPU stop tasks: + */ +const struct sched_class stop_sched_class = { + .next = &rt_sched_class, + + .enqueue_task = enqueue_task_stop, + .dequeue_task = dequeue_task_stop, + .yield_task = yield_task_stop, + + .check_preempt_curr = check_preempt_curr_stop, + + .pick_next_task = pick_next_task_stop, + .put_prev_task = put_prev_task_stop, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_stop, +#endif + + .set_curr_task = set_curr_task_stop, + .task_tick = task_tick_stop, + + .get_rr_interval = get_rr_interval_stop, + + .prio_changed = prio_changed_stop, + .switched_to = switched_to_stop, +}; |
