diff --git a/init/Kconfig b/init/Kconfig
index 223c27ff7903..0f8ce0c5e876 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -890,6 +890,16 @@ config SOCK_CGROUP_DATA
 
 endif # CGROUPS
 
+config SCHED_CORE_CTL
+	bool "QTI Core Control"
+	depends on SMP
+	help
+	  This option enables the core control functionality in
+	  the scheduler. Core control automatically offlines and
+	  onlines cores based on CPU load and utilization.
+
+	  If unsure, say N here.
+
 config CHECKPOINT_RESTORE
 	bool "Checkpoint/restore support" if EXPERT
 	select PROC_CHILDREN
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index dc54a455a20a..3c5aed007b4c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -30,3 +30,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
+obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
new file mode 100644
index 000000000000..8f071757d516
--- /dev/null
+++ b/kernel/sched/core_ctl.c
@@ -0,0 +1,1014 @@
+/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
+
+#include <trace/events/sched.h>
+
+#define MAX_CPUS_PER_GROUP 4
+
+struct cpu_data {
+	/* Per CPU data. */
+	bool inited;
+	bool online;
+	bool rejected;
+	bool is_busy;
+	bool not_preferred;
+	unsigned int busy;
+	unsigned int cpu;
+	struct list_head sib;
+	unsigned int first_cpu;
+
+	/* Per cluster data set only on first CPU */
+	unsigned int min_cpus;
+	unsigned int max_cpus;
+	unsigned int offline_delay_ms;
+	unsigned int busy_up_thres[MAX_CPUS_PER_GROUP];
+	unsigned int busy_down_thres[MAX_CPUS_PER_GROUP];
+	unsigned int online_cpus;
+	unsigned int avail_cpus;
+	unsigned int num_cpus;
+	unsigned int need_cpus;
+	unsigned int task_thres;
+	s64 need_ts;
+	struct list_head lru;
+	bool pending;
+	spinlock_t pending_lock;
+	bool is_big_cluster;
+	int nrrun;
+	bool nrrun_changed;
+	struct timer_list timer;
+	struct task_struct *hotplug_thread;
+	struct kobject kobj;
+};
+
+static DEFINE_PER_CPU(struct cpu_data, cpu_state);
+static DEFINE_SPINLOCK(state_lock);
+static void apply_need(struct cpu_data *f);
+static void wake_up_hotplug_thread(struct cpu_data *state);
+
+/* ========================= sysfs interface =========================== */
+
+static ssize_t store_min_cpus(struct cpu_data *state,
+			      const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->min_cpus = min(val, state->max_cpus);
+	wake_up_hotplug_thread(state);
+
+	return count;
+}
+
+static ssize_t show_min_cpus(struct cpu_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus);
+}
+
+static ssize_t store_max_cpus(struct cpu_data *state,
+			      const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	val = min(val, state->num_cpus);
+	state->max_cpus = val;
+	state->min_cpus = min(state->min_cpus, state->max_cpus);
+	wake_up_hotplug_thread(state);
+
+	return count;
+}
+
+static ssize_t show_max_cpus(struct cpu_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus);
+}
+
+static ssize_t store_offline_delay_ms(struct cpu_data *state,
+				      const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->offline_delay_ms = val;
+	apply_need(state);
+
+	return count;
+}
+
+static ssize_t show_task_thres(struct cpu_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres);
+}
+
+static ssize_t store_task_thres(struct cpu_data *state,
+				const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	if (val < state->num_cpus)
+		return -EINVAL;
+
+	state->task_thres = val;
+	apply_need(state);
+
+	return count;
+}
+
+static ssize_t show_offline_delay_ms(struct cpu_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms);
+}
+
+static ssize_t store_busy_up_thres(struct cpu_data *state,
+				   const char *buf, size_t count)
+{
+	unsigned int val[MAX_CPUS_PER_GROUP];
+	int ret, i;
+
+	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	if (ret == 1) {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_up_thres[i] = val[0];
+	} else {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_up_thres[i] = val[i];
+	}
+	apply_need(state);
+	return count;
+}
+
+static ssize_t show_busy_up_thres(struct cpu_data *state, char *buf)
+{
+	int i, count = 0;
+
+	for (i = 0; i < state->num_cpus; i++)
+		count += snprintf(buf + count, PAGE_SIZE - count, "%u ",
+				  state->busy_up_thres[i]);
+	count += snprintf(buf + count, PAGE_SIZE - count, "\n");
+	return count;
+}
+
+static ssize_t store_busy_down_thres(struct cpu_data *state,
+				     const char *buf, size_t count)
+{
+	unsigned int val[MAX_CPUS_PER_GROUP];
+	int ret, i;
+
+	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	if (ret == 1) {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_down_thres[i] = val[0];
+	} else {
+		for (i = 0; i < state->num_cpus; i++)
+			state->busy_down_thres[i] = val[i];
+	}
+	apply_need(state);
+	return count;
+}
+
+static ssize_t show_busy_down_thres(struct cpu_data *state, char *buf)
+{
+	int i, count = 0;
+
+	for (i = 0; i < state->num_cpus; i++)
+		count += snprintf(buf + count, PAGE_SIZE - count, "%u ",
+				  state->busy_down_thres[i]);
+	count += snprintf(buf + count, PAGE_SIZE - count, "\n");
+	return count;
+}
+
+static ssize_t store_is_big_cluster(struct cpu_data *state,
+				    const char *buf, size_t count)
+{
+	unsigned int val;
+
+	if (sscanf(buf, "%u\n", &val) != 1)
+		return -EINVAL;
+
+	state->is_big_cluster = val ? 1 : 0;
+	return count;
+}
+
+static ssize_t show_is_big_cluster(struct cpu_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster);
+}
+
+static ssize_t show_cpus(struct cpu_data *state, char *buf)
+{
+	struct cpu_data *c;
+	ssize_t count = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_for_each_entry(c, &state->lru, sib) {
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "CPU%u (%s)\n", c->cpu,
+				  c->online ? "Online" : "Offline");
+	}
+	spin_unlock_irqrestore(&state_lock, flags);
+	return count;
+}
+
+static ssize_t show_need_cpus(struct cpu_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus);
+}
+
+static ssize_t show_online_cpus(struct cpu_data *state, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", state->online_cpus);
+}
+
+static ssize_t show_global_state(struct cpu_data *state, char *buf)
+{
+	struct cpu_data *c;
+	ssize_t count = 0;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "CPU%u\n", cpu);
+		c = &per_cpu(cpu_state, cpu);
+		if (!c->inited)
+			continue;
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tCPU: %u\n", c->cpu);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tOnline: %u\n", c->online);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tRejected: %u\n", c->rejected);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tFirst CPU: %u\n", c->first_cpu);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tBusy%%: %u\n", c->busy);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tIs busy: %u\n", c->is_busy);
+		if (c->cpu != c->first_cpu)
+			continue;
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tNr running: %u\n", c->nrrun);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tAvail CPUs: %u\n", c->avail_cpus);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tNeed CPUs: %u\n", c->need_cpus);
+	}
+
+	return count;
+}
+
+static ssize_t store_not_preferred(struct cpu_data *state,
+				   const char *buf, size_t count)
+{
+	struct cpu_data *c;
+	unsigned int i, first_cpu;
+	unsigned int val[MAX_CPUS_PER_GROUP];
+	int ret;
+
+	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
+	if (ret != 1 && ret != state->num_cpus)
+		return -EINVAL;
+
+	first_cpu = state->first_cpu;
+
+	for (i = 0; i < state->num_cpus; i++) {
+		c = &per_cpu(cpu_state, first_cpu);
+		c->not_preferred = val[i];
+		first_cpu++;
+	}
+
+	return count;
+}
+
+static ssize_t show_not_preferred(struct cpu_data *state, char *buf)
+{
+	struct cpu_data *c;
+	ssize_t count = 0;
+	unsigned int i, first_cpu;
+
+	first_cpu = state->first_cpu;
+
+	for (i = 0; i < state->num_cpus; i++) {
+		c = &per_cpu(cpu_state, first_cpu);
+		count += snprintf(buf + count, PAGE_SIZE - count,
+				  "\tCPU:%d %u\n", first_cpu, c->not_preferred);
+		first_cpu++;
+	}
+
+	return count;
+}
+
+struct core_ctl_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct cpu_data *, char *);
+	ssize_t (*store)(struct cpu_data *, const char *, size_t count);
+};
+
+#define core_ctl_attr_ro(_name)		\
+static struct core_ctl_attr _name =	\
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define core_ctl_attr_rw(_name)			\
+static struct core_ctl_attr _name =		\
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+core_ctl_attr_rw(min_cpus);
+core_ctl_attr_rw(max_cpus);
+core_ctl_attr_rw(offline_delay_ms);
+core_ctl_attr_rw(busy_up_thres);
+core_ctl_attr_rw(busy_down_thres);
+core_ctl_attr_rw(task_thres);
+core_ctl_attr_rw(is_big_cluster);
+core_ctl_attr_ro(cpus);
+core_ctl_attr_ro(need_cpus);
+core_ctl_attr_ro(online_cpus);
+core_ctl_attr_ro(global_state);
+core_ctl_attr_rw(not_preferred);
+
+static struct attribute *default_attrs[] = {
+	&min_cpus.attr,
+	&max_cpus.attr,
+	&offline_delay_ms.attr,
+	&busy_up_thres.attr,
+	&busy_down_thres.attr,
+	&task_thres.attr,
+	&is_big_cluster.attr,
+	&cpus.attr,
+	&need_cpus.attr,
+	&online_cpus.attr,
+	&global_state.attr,
+	&not_preferred.attr,
+	NULL
+};
+
+#define to_cpu_data(k) container_of(k, struct cpu_data, kobj)
+#define to_attr(a) container_of(a, struct core_ctl_attr, attr)
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct cpu_data *data = to_cpu_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->show)
+		ret = cattr->show(data, buf);
+
+	return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+		     const char *buf, size_t count)
+{
+	struct cpu_data *data = to_cpu_data(kobj);
+	struct core_ctl_attr *cattr = to_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (cattr->store)
+		ret = cattr->store(data, buf, count);
+
+	return ret;
+}
+
+static const struct sysfs_ops sysfs_ops = {
+	.show	= show,
+	.store	= store,
+};
+
+static struct kobj_type ktype_core_ctl = {
+	.sysfs_ops	= &sysfs_ops,
+	.default_attrs	= default_attrs,
+};
+
+/* ==================== runqueue based core count =================== */
+
+#define RQ_AVG_TOLERANCE 2
+#define RQ_AVG_DEFAULT_MS 20
+#define NR_RUNNING_TOLERANCE 5
+static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS;
+
+static s64 rq_avg_timestamp_ms;
+static struct timer_list rq_avg_timer;
+
+static void update_running_avg(bool trigger_update)
+{
+	int cpu;
+	struct cpu_data *pcpu;
+	int avg, iowait_avg, big_avg, old_nrrun;
+	s64 now;
+	unsigned long flags;
+
+	spin_lock_irqsave(&state_lock, flags);
+
+	now = ktime_to_ms(ktime_get());
+	if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) {
+		spin_unlock_irqrestore(&state_lock, flags);
+		return;
+	}
+	rq_avg_timestamp_ms = now;
+	sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg);
+
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	/*
+	 * Round up to the next integer if the average nr running tasks
+	 * is within NR_RUNNING_TOLERANCE/100 of the next integer.
+	 * If plain rounding up were used, a transient task could trigger
+	 * an online event; by the time the core is onlined, the task
+	 * may already have finished.
+	 * Rounding to the closest integer suffers from the same problem,
+	 * because the scheduler might only provide running stats once per
+	 * jiffy, and a transient task could skew the number for one jiffy.
+	 * If core control samples every 2 jiffies, it would observe an
+	 * extra 0.5 in the running average, which rounds up to 1 task.
+	 */
+	avg = (avg + NR_RUNNING_TOLERANCE) / 100;
+	big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100;
+
+	for_each_possible_cpu(cpu) {
+		pcpu = &per_cpu(cpu_state, cpu);
+		if (!pcpu->inited || pcpu->first_cpu != cpu)
+			continue;
+		old_nrrun = pcpu->nrrun;
+		/*
+		 * The big cluster only needs to take care of big tasks, but
+		 * if there are not enough big cores, big tasks need to run
+		 * on little cores as well. Thus the little cluster's
+		 * runqueue stat has to use the overall runqueue average, or
+		 * derive how many big tasks would have to run on little
+		 * cores. The latter is not easy to get right, given that
+		 * core control reacts much more slowly than the scheduler
+		 * and cannot predict the scheduler's behavior.
+		 */
+		pcpu->nrrun = pcpu->is_big_cluster ? big_avg : avg;
+		if (pcpu->nrrun != old_nrrun) {
+			if (trigger_update)
+				apply_need(pcpu);
+			else
+				pcpu->nrrun_changed = true;
+		}
+	}
+}
+
+/* adjust needed CPUs based on current runqueue information */
+static unsigned int apply_task_need(struct cpu_data *f, unsigned int new_need)
+{
+	/* Online all cores if there are enough tasks */
+	if (f->nrrun >= f->task_thres)
+		return f->num_cpus;
+
+	/* Only online more cores if there are tasks to run */
+	if (f->nrrun > new_need)
+		return new_need + 1;
+
+	return new_need;
+}
+
+static u64 round_to_nw_start(void)
+{
+	unsigned long step = msecs_to_jiffies(rq_avg_period_ms);
+	u64 jif = get_jiffies_64();
+
+	do_div(jif, step);
+	return (jif + 1) * step;
+}
+
+static void rq_avg_timer_func(unsigned long not_used)
+{
+	update_running_avg(true);
+	mod_timer(&rq_avg_timer, round_to_nw_start());
+}
+
+/* ======================= load based core count ====================== */
+
+static unsigned int apply_limits(struct cpu_data *f, unsigned int need_cpus)
+{
+	return min(max(f->min_cpus, need_cpus), f->max_cpus);
+}
+
+static bool eval_need(struct cpu_data *f)
+{
+	unsigned long flags;
+	struct cpu_data *c;
+	unsigned int need_cpus = 0, last_need, thres_idx;
+	int ret = 0;
+	bool need_flag = false;
+	s64 now;
+
+	if (unlikely(!f->inited))
+		return 0;
+
+	spin_lock_irqsave(&state_lock, flags);
+	thres_idx = f->online_cpus ? f->online_cpus - 1 : 0;
+	list_for_each_entry(c, &f->lru, sib) {
+		if (c->busy >= f->busy_up_thres[thres_idx])
+			c->is_busy = true;
+		else if (c->busy < f->busy_down_thres[thres_idx])
+			c->is_busy = false;
+		need_cpus += c->is_busy;
+	}
+	need_cpus = apply_task_need(f, need_cpus);
+	need_flag = apply_limits(f, need_cpus) != apply_limits(f, f->need_cpus);
+	last_need = f->need_cpus;
+
+	now = ktime_to_ms(ktime_get());
+
+	if (need_cpus == last_need) {
+		f->need_ts = now;
+		spin_unlock_irqrestore(&state_lock, flags);
+		return 0;
+	}
+
+	if (need_cpus > last_need) {
+		ret = 1;
+	} else if (need_cpus < last_need) {
+		s64 elapsed = now - f->need_ts;
+
+		if (elapsed >= f->offline_delay_ms) {
+			ret = 1;
+		} else {
+			mod_timer(&f->timer, jiffies +
+				  msecs_to_jiffies(f->offline_delay_ms));
+		}
+	}
+
+	if (ret) {
+		f->need_ts = now;
+		f->need_cpus = need_cpus;
+	}
+
+	trace_core_ctl_eval_need(f->cpu, last_need, need_cpus,
+				 ret && need_flag);
+	spin_unlock_irqrestore(&state_lock, flags);
+
+	return ret && need_flag;
+}
+
+static void apply_need(struct cpu_data *f)
+{
+	if (eval_need(f))
+		wake_up_hotplug_thread(f);
+}
+
+static int core_ctl_set_busy(unsigned int cpu, unsigned int busy)
+{
+	struct cpu_data *c = &per_cpu(cpu_state, cpu);
+	struct cpu_data *f;
+	unsigned int old_is_busy = c->is_busy;
+
+	if (!c->inited)
+		return 0;
+	f = &per_cpu(cpu_state, c->first_cpu);
+
+	update_running_avg(false);
+	if (c->busy == busy && !f->nrrun_changed)
+		return 0;
+	c->busy = busy;
+	f->nrrun_changed = false;
+
+	apply_need(f);
+	trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy);
+	return 0;
+}
+
+/* ========================= core count enforcement ==================== */
+
+/*
+ * If the current thread is a hotplug thread, don't attempt to wake up
+ * itself or other hotplug threads, because that would deadlock. Instead,
+ * schedule a timer to fire on the next timer tick and wake up the thread.
+ */
+static void wake_up_hotplug_thread(struct cpu_data *state)
+{
+	unsigned long flags;
+	int cpu;
+	struct cpu_data *pcpu;
+	bool no_wakeup = false;
+
+	for_each_possible_cpu(cpu) {
+		pcpu = &per_cpu(cpu_state, cpu);
+		if (cpu != pcpu->first_cpu)
+			continue;
+		if (pcpu->hotplug_thread == current) {
+			no_wakeup = true;
+			break;
+		}
+	}
+
+	spin_lock_irqsave(&state->pending_lock, flags);
+	state->pending = true;
+	spin_unlock_irqrestore(&state->pending_lock, flags);
+
+	if (no_wakeup) {
+		spin_lock_irqsave(&state_lock, flags);
+		mod_timer(&state->timer, jiffies);
+		spin_unlock_irqrestore(&state_lock, flags);
+	} else {
+		wake_up_process(state->hotplug_thread);
+	}
+}
+
+static void core_ctl_timer_func(unsigned long cpu)
+{
+	struct cpu_data *state = &per_cpu(cpu_state, cpu);
+	unsigned long flags;
+
+	if (eval_need(state)) {
+		spin_lock_irqsave(&state->pending_lock, flags);
+		state->pending = true;
+		spin_unlock_irqrestore(&state->pending_lock, flags);
+		wake_up_process(state->hotplug_thread);
+	}
+}
+
+static int core_ctl_online_core(unsigned int cpu)
+{
+	int ret;
+	struct device *dev;
+
+	lock_device_hotplug();
+	dev = get_cpu_device(cpu);
+	if (!dev) {
+		pr_err("%s: failed to get cpu%d device\n", __func__, cpu);
+		ret = -ENODEV;
+	} else {
+		ret = device_online(dev);
+	}
+	unlock_device_hotplug();
+	return ret;
+}
+
+static int core_ctl_offline_core(unsigned int cpu)
+{
+	int ret;
+	struct device *dev;
+
+	lock_device_hotplug();
+	dev = get_cpu_device(cpu);
+	if (!dev) {
+		pr_err("%s: failed to get cpu%d device\n", __func__, cpu);
+		ret = -ENODEV;
+	} else {
+		ret = device_offline(dev);
+	}
+	unlock_device_hotplug();
+	return ret;
+}
+
+static void __ref do_hotplug(struct cpu_data *f)
+{
+	unsigned int need;
+	struct cpu_data *c, *tmp;
+
+	need = apply_limits(f, f->need_cpus);
+	pr_debug("Trying to adjust group %u to %u\n", f->first_cpu, need);
+
+	if (f->online_cpus > need) {
+		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
+			if (!c->online)
+				continue;
+
+			if (f->online_cpus == need)
+				break;
+
+			/* Don't offline busy CPUs. */
+			if (c->is_busy)
+				continue;
+
+			pr_debug("Trying to Offline CPU%u\n", c->cpu);
+			if (core_ctl_offline_core(c->cpu))
+				pr_debug("Unable to Offline CPU%u\n", c->cpu);
+		}
+
+		/*
+		 * If the number of online CPUs is within the limits, then
+		 * don't force any busy CPUs offline.
+		 */
+		if (f->online_cpus <= f->max_cpus)
+			return;
+
+		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
+			if (!c->online)
+				continue;
+
+			if (f->online_cpus <= f->max_cpus)
+				break;
+
+			pr_debug("Trying to Offline CPU%u\n", c->cpu);
+			if (core_ctl_offline_core(c->cpu))
+				pr_debug("Unable to Offline CPU%u\n", c->cpu);
+		}
+	} else if (f->online_cpus < need) {
+		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
+			if (c->online || c->rejected || c->not_preferred)
+				continue;
+			if (f->online_cpus == need)
+				break;
+
+			pr_debug("Trying to Online CPU%u\n", c->cpu);
+			if (core_ctl_online_core(c->cpu))
+				pr_debug("Unable to Online CPU%u\n", c->cpu);
+		}
+
+		if (f->online_cpus == need)
+			return;
+
+		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
+			if (c->online || c->rejected || !c->not_preferred)
+				continue;
+			if (f->online_cpus == need)
+				break;
+
+			pr_debug("Trying to Online CPU%u\n", c->cpu);
+			if (core_ctl_online_core(c->cpu))
+				pr_debug("Unable to Online CPU%u\n", c->cpu);
+		}
+	}
+}
+
+static int __ref try_hotplug(void *data)
+{
+	struct cpu_data *f = data;
+	unsigned long flags;
+
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_lock_irqsave(&f->pending_lock, flags);
+		if (!f->pending) {
+			spin_unlock_irqrestore(&f->pending_lock, flags);
+			schedule();
+			if (kthread_should_stop())
+				break;
+			spin_lock_irqsave(&f->pending_lock, flags);
+		}
+		set_current_state(TASK_RUNNING);
+		f->pending = false;
+		spin_unlock_irqrestore(&f->pending_lock, flags);
+
+		do_hotplug(f);
+	}
+
+	return 0;
+}
+
+static int __ref cpu_callback(struct notifier_block *nfb,
+			      unsigned long action, void *hcpu)
+{
+	uint32_t cpu = (uintptr_t)hcpu;
+	struct cpu_data *state = &per_cpu(cpu_state, cpu);
+	struct cpu_data *f;
+	int ret = NOTIFY_OK;
+	unsigned long flags;
+
+	/* Don't affect suspend/resume */
+	if (action & CPU_TASKS_FROZEN)
+		return NOTIFY_OK;
+
+	if (unlikely(!state->inited))
+		return NOTIFY_OK;
+
+	f = &per_cpu(cpu_state, state->first_cpu);
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		/* If the online state of the CPU got out of sync, fix it. */
+		if (state->online) {
+			f->online_cpus--;
+			state->online = false;
+			pr_warn("CPU%d offline when state is online\n", cpu);
+		}
+
+		if (state->rejected) {
+			state->rejected = false;
+			f->avail_cpus++;
+		}
+
+		/*
+		 * If a CPU is in the process of coming up, mark it as online
+		 * so that there's no race with the hotplug thread bringing
+		 * up more CPUs than necessary.
+		 */
+		if (apply_limits(f, f->need_cpus) <= f->online_cpus) {
+			pr_debug("Prevent CPU%d onlining\n", cpu);
+			ret = NOTIFY_BAD;
+		} else {
+			state->online = true;
+			f->online_cpus++;
+		}
+		break;
+
+	case CPU_ONLINE:
+		/*
+		 * Moving to the end of the list should only happen in
+		 * CPU_ONLINE and not on CPU_UP_PREPARE to prevent an
+		 * infinite list traversal when thermal (or other entities)
+		 * reject trying to online CPUs.
+		 */
+		spin_lock_irqsave(&state_lock, flags);
+		list_del(&state->sib);
+		list_add_tail(&state->sib, &f->lru);
+		spin_unlock_irqrestore(&state_lock, flags);
+		break;
+
+	case CPU_DEAD:
+		/* Move a CPU to the end of the LRU when it goes offline. */
+		spin_lock_irqsave(&state_lock, flags);
+		list_del(&state->sib);
+		list_add_tail(&state->sib, &f->lru);
+		spin_unlock_irqrestore(&state_lock, flags);
+
+		/* Fall through */
+
+	case CPU_UP_CANCELED:
+		/* If the online state of the CPU got out of sync, fix it. */
+		if (!state->online) {
+			f->online_cpus++;
+			pr_warn("CPU%d online when state is offline\n", cpu);
+		}
+
+		if (!state->rejected && action == CPU_UP_CANCELED) {
+			state->rejected = true;
+			f->avail_cpus--;
+		}
+
+		state->online = false;
+		state->busy = 0;
+		f->online_cpus--;
+		break;
+	}
+
+	if (f->online_cpus < apply_limits(f, f->need_cpus)
+	    && f->online_cpus < f->avail_cpus
+	    && action == CPU_DEAD)
+		wake_up_hotplug_thread(f);
+
+	return ret;
+}
+
+static struct notifier_block __refdata cpu_notifier = {
+	.notifier_call = cpu_callback,
+};
+
+/* ============================ init code ============================== */
+
+static int group_init(struct cpumask *mask)
+{
+	struct device *dev;
+	unsigned int first_cpu = cpumask_first(mask);
+	struct cpu_data *f = &per_cpu(cpu_state, first_cpu);
+	struct cpu_data *state;
+	unsigned int cpu;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+
+	if (likely(f->inited))
+		return 0;
+
+	dev = get_cpu_device(first_cpu);
+	if (!dev)
+		return -ENODEV;
+
+	pr_info("Creating CPU group %d\n", first_cpu);
+
+	f->num_cpus = cpumask_weight(mask);
+	if (f->num_cpus > MAX_CPUS_PER_GROUP) {
+		pr_err("HW configuration not supported\n");
+		return -EINVAL;
+	}
+	f->min_cpus = 1;
+	f->max_cpus = f->num_cpus;
+	f->need_cpus = f->num_cpus;
+	f->avail_cpus = f->num_cpus;
+	f->offline_delay_ms = 100;
+	f->task_thres = UINT_MAX;
+	f->nrrun = f->num_cpus;
+	INIT_LIST_HEAD(&f->lru);
+	init_timer(&f->timer);
+	spin_lock_init(&f->pending_lock);
+	f->timer.function = core_ctl_timer_func;
+	f->timer.data = first_cpu;
+
+	for_each_cpu(cpu, mask) {
+		pr_info("Init CPU%u state\n", cpu);
+
+		state = &per_cpu(cpu_state, cpu);
+		state->cpu = cpu;
+		state->first_cpu = first_cpu;
+
+		if (cpu_online(cpu)) {
+			f->online_cpus++;
+			state->online = true;
+		}
+
+		list_add_tail(&state->sib, &f->lru);
+	}
+
+	f->hotplug_thread = kthread_run(try_hotplug, (void *) f,
+					"core_ctl/%d", first_cpu);
+	sched_setscheduler_nocheck(f->hotplug_thread, SCHED_FIFO, &param);
+
+	for_each_cpu(cpu, mask) {
+		state = &per_cpu(cpu_state, cpu);
+		state->inited = true;
+	}
+
+	kobject_init(&f->kobj, &ktype_core_ctl);
+	return kobject_add(&f->kobj, &dev->kobj, "core_ctl");
+}
+
+static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val,
+			     void *data)
+{
+	struct cpufreq_policy *policy = data;
+
+	switch (val) {
+	case CPUFREQ_CREATE_POLICY:
+		group_init(policy->related_cpus);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_pol_nb = {
+	.notifier_call = cpufreq_policy_cb,
+};
+
+static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val,
+			  void *data)
+{
+	struct cpufreq_govinfo *info = data;
+
+	switch (val) {
+	case CPUFREQ_LOAD_CHANGE:
+		core_ctl_set_busy(info->cpu, info->load);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_gov_nb = {
+	.notifier_call = cpufreq_gov_cb,
+};
+
+static int __init core_ctl_init(void)
+{
+	struct cpufreq_policy *policy;
+	unsigned int cpu;
+
+	register_cpu_notifier(&cpu_notifier);
+	cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
+	cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
+	init_timer_deferrable(&rq_avg_timer);
+	rq_avg_timer.function = rq_avg_timer_func;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		policy = cpufreq_cpu_get(cpu);
+		if (policy) {
+			group_init(policy->related_cpus);
+			cpufreq_cpu_put(policy);
+		}
+	}
+	put_online_cpus();
+	mod_timer(&rq_avg_timer, round_to_nw_start());
+	return 0;
+}
+
+late_initcall(core_ctl_init);
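---

Note (illustrative, not part of the patch): the need-CPU evaluation in
eval_need() composes three steps — per-CPU busy hysteresis against
busy_up_thres/busy_down_thres indexed by the current online-CPU count, a
runqueue floor from apply_task_need(), and clamping through apply_limits().
The following stand-alone userspace sketch models that arithmetic; the
4-CPU cluster, the 60%/30% thresholds, task_thres of 8, and the sample
loads are all hypothetical values chosen for the example. It compiles
with any C99 compiler for experimenting with thresholds:

/* Userspace model of eval_need()'s arithmetic (hypothetical inputs). */
#include <stdio.h>

#define MAX_CPUS_PER_GROUP 4

struct group {
	unsigned int min_cpus, max_cpus, num_cpus, task_thres;
	unsigned int busy_up_thres[MAX_CPUS_PER_GROUP];
	unsigned int busy_down_thres[MAX_CPUS_PER_GROUP];
};

/* Same clamp as the patch: min_cpus <= need <= max_cpus. */
static unsigned int apply_limits(const struct group *g, unsigned int need)
{
	unsigned int n = need < g->min_cpus ? g->min_cpus : need;

	return n > g->max_cpus ? g->max_cpus : n;
}

/* Runqueue floor, mirroring apply_task_need() in the patch. */
static unsigned int apply_task_need(const struct group *g, int nrrun,
				    unsigned int need)
{
	if (nrrun >= (int)g->task_thres)	/* enough tasks: all cores */
		return g->num_cpus;
	if (nrrun > (int)need)			/* tasks waiting: one more */
		return need + 1;
	return need;
}

int main(void)
{
	/* Hypothetical 4-CPU cluster with 60%-up / 30%-down hysteresis. */
	struct group g = {
		.min_cpus = 1, .max_cpus = 4, .num_cpus = 4, .task_thres = 8,
		.busy_up_thres = { 60, 60, 60, 60 },
		.busy_down_thres = { 30, 30, 30, 30 },
	};
	unsigned int busy[4] = { 75, 40, 10, 0 };	/* per-CPU load % */
	int was_busy[4] = { 1, 1, 0, 0 };		/* previous states */
	unsigned int online = 2, need = 0;
	unsigned int thres_idx = online ? online - 1 : 0;
	int nrrun = 3;					/* runqueue average */

	for (int i = 0; i < 4; i++) {
		if (busy[i] >= g.busy_up_thres[thres_idx])
			was_busy[i] = 1;
		else if (busy[i] < g.busy_down_thres[thres_idx])
			was_busy[i] = 0;
		/* else: keep previous state (hysteresis band) */
		need += was_busy[i];
	}
	need = apply_task_need(&g, nrrun, need);
	printf("need_cpus = %u\n", apply_limits(&g, need));
	return 0;
}

With these sample inputs it prints "need_cpus = 3": CPU0 trips the 60%
up-threshold, CPU1 stays busy inside the 30-60% hysteresis band, and a
runqueue average of 3 tasks adds one more core before the [min_cpus,
max_cpus] clamp is applied.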