sched/cass: Introduce the Capacity Aware Superset Scheduler

The Capacity Aware Superset Scheduler (CASS) optimizes runqueue selection
of CFS tasks. By using CPU capacity as a basis for comparing the relative
utilization between different CPUs, CASS fairly balances load across CPUs
of varying capacities. This results in improved multi-core performance,
especially when CPUs are overutilized because CASS doesn't clip a CPU's
utilization when it eclipses the CPU's capacity.
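For illustration, here is a minimal sketch of the relative-utilization metric (the helper name relative_util is hypothetical; the actual calculation lives in cass_best_cpu() in the patch below):

    static unsigned long relative_util(unsigned long util, unsigned long cap)
    {
    	/* Deliberately not clamped to SCHED_CAPACITY_SCALE */
    	return util * SCHED_CAPACITY_SCALE / cap;
    }

For example, a little CPU with capacity 512 running 256 units of utilization and a big CPU with capacity 1024 running 512 units both evaluate to SCHED_CAPACITY_SCALE / 2, so CASS treats them as equally loaded; a CPU whose utilization eclipses its capacity simply yields a value above SCHED_CAPACITY_SCALE rather than being clipped.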

As a superset of capacity aware scheduling, CASS implements a hierarchy of
criteria to determine the better CPU to wake a task upon between CPUs that
have the same relative utilization. This way, single-core performance,
latency, and cache affinity are all optimized where possible.
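As a rough sketch only (not the patch code; the real logic is in cass_cpu_better() below, which additionally prefers the current CPU for sync wakes and a CPU that shares a cache with the previous CPU), the tie-break chain behaves like this:

    struct cand {
    	unsigned long util;	/* relative utilization */
    	unsigned long cap;	/* current capacity */
    	unsigned int exit_lat;	/* idle exit latency, 0 if not idle */
    	int cpu;
    };

    static bool cand_better(const struct cand *a, const struct cand *b, int prev_cpu)
    {
    	if (a->util != b->util)		/* lower relative utilization wins */
    		return a->util < b->util;
    	if (a->cap != b->cap)		/* then higher capacity */
    		return a->cap > b->cap;
    	if (a->exit_lat != b->exit_lat)	/* then lower idle exit latency */
    		return a->exit_lat < b->exit_lat;
    	return a->cpu == prev_cpu;	/* finally, stick with the previous CPU */
    }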

CASS doesn't feature explicit energy awareness but its basic load balancing
principle results in decreased overall energy, often better than what is
possible with explicit energy awareness. By fairly balancing load based on
relative utilization, all CPUs are kept at their lowest P-state necessary
to satisfy the overall load at any given moment.

This version of CASS is adjusted to work on older kernels.

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
Signed-off-by: clarencelol <clarencekuiek@icloud.com>
---
 init/Kconfig        |  25 ++
 kernel/sched/cass.c | 203 ++
 kernel/sched/fair.c |  11 ++
 3 files changed, 239 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig
@@ -1086,6 +1086,31 @@ config NET_NS
endif # NAMESPACES

config SCHED_CASS
	bool "Capacity Aware Superset Scheduler"
	depends on SMP
	help
	  This enables the Capacity Aware Superset Scheduler (CASS), which
	  optimizes runqueue selection of CFS tasks. By using CPU capacity as a
	  basis for comparing the relative utilization between different CPUs,
	  CASS fairly balances load across CPUs of varying capacities. This
	  results in improved multi-core performance, especially when CPUs are
	  overutilized because CASS doesn't clip a CPU's utilization when it
	  eclipses the CPU's capacity.

	  As a superset of capacity aware scheduling, CASS implements a
	  hierarchy of criteria to determine the better CPU to wake a task upon
	  between CPUs that have the same relative utilization. This way,
	  single-core performance, latency, and cache affinity are all optimized
	  where possible.

	  CASS doesn't feature explicit energy awareness but its basic load
	  balancing principle results in decreased overall energy, often better
	  than what is possible with explicit energy awareness. By fairly
	  balancing load based on relative utilization, all CPUs are kept at
	  their lowest P-state necessary to satisfy the overall load at any
	  given moment.

config SCHED_AUTOGROUP
	bool "Automatic process group scheduling"
	select CGROUPS

diff --git a/kernel/sched/cass.c b/kernel/sched/cass.c
@@ -0,0 +1,203 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023 Sultan Alsawaf <sultan@kerneltoast.com>.
 */

/**
 * DOC: Capacity Aware Superset Scheduler (CASS) description
 *
 * The Capacity Aware Superset Scheduler (CASS) optimizes runqueue selection of
 * CFS tasks. By using CPU capacity as a basis for comparing the relative
 * utilization between different CPUs, CASS fairly balances load across CPUs of
 * varying capacities. This results in improved multi-core performance,
 * especially when CPUs are overutilized because CASS doesn't clip a CPU's
 * utilization when it eclipses the CPU's capacity.
 *
 * As a superset of capacity aware scheduling, CASS implements a hierarchy of
 * criteria to determine the better CPU to wake a task upon between CPUs that
 * have the same relative utilization. This way, single-core performance,
 * latency, and cache affinity are all optimized where possible.
 *
 * CASS doesn't feature explicit energy awareness but its basic load balancing
 * principle results in decreased overall energy, often better than what is
 * possible with explicit energy awareness. By fairly balancing load based on
 * relative utilization, all CPUs are kept at their lowest P-state necessary to
 * satisfy the overall load at any given moment.
 */

struct cass_cpu_cand {
	int cpu;
	unsigned int exit_lat;
	unsigned long cap;
	unsigned long util;
};

static __always_inline
unsigned long cass_cpu_util(int cpu, bool sync)
{
	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);

	/* Deduct @current's util from this CPU if this is a sync wake */
	if (sync && cpu == smp_processor_id())
		sub_positive(&util, task_util(current));

	if (sched_feat(UTIL_EST))
		util = max_t(unsigned long, util,
			     READ_ONCE(cfs_rq->avg.util_est.enqueued));

	return util;
}

/* Returns true if @a is a better CPU than @b */
static __always_inline
bool cass_cpu_better(const struct cass_cpu_cand *a,
		     const struct cass_cpu_cand *b,
		     int prev_cpu, bool sync)
{
#define cass_cmp(a, b) ({ res = (a) - (b); })
#define cass_eq(a, b) ({ res = (a) == (b); })
	long res;

	/* Prefer the CPU with lower relative utilization */
	if (cass_cmp(b->util, a->util))
		goto done;

	/* Prefer the current CPU for sync wakes */
	if (sync && (cass_eq(a->cpu, smp_processor_id()) ||
		     !cass_cmp(b->cpu, smp_processor_id())))
		goto done;

	/* Prefer the CPU with higher capacity */
	if (cass_cmp(a->cap, b->cap))
		goto done;

	/* Prefer the CPU with lower idle exit latency */
	if (cass_cmp(b->exit_lat, a->exit_lat))
		goto done;

	/* Prefer the previous CPU */
	if (cass_eq(a->cpu, prev_cpu) || !cass_cmp(b->cpu, prev_cpu))
		goto done;

	/* Prefer the CPU that shares a cache with the previous CPU */
	if (cass_cmp(cpus_share_cache(a->cpu, prev_cpu),
		     cpus_share_cache(b->cpu, prev_cpu)))
		goto done;

	/* @a isn't a better CPU than @b. @res must be <= 0 to indicate such. */
done:
	/* @a is a better CPU than @b if @res is positive */
	return res > 0;
}

static int cass_best_cpu(struct task_struct *p, int prev_cpu, bool sync)
{
	/* Initialize @best such that @best always has a valid CPU at the end */
	struct cass_cpu_cand cands[2], *best = cands, *curr;
	struct cpuidle_state *idle_state;
	bool has_idle = false;
	unsigned long p_util;
	int cidx = 0, cpu;

	/* Get the utilization for this task */
	p_util = task_util_est(p);

	/*
	 * Find the best CPU to wake @p on. The RCU read lock is needed for
	 * idle_get_state().
	 */
	rcu_read_lock();
	for_each_cpu_and(cpu, &p->cpus_allowed, cpu_active_mask) {
		/* Use the free candidate slot */
		curr = &cands[cidx];
		curr->cpu = cpu;

		/*
		 * Check if this CPU is idle. For sync wakes, always treat the
		 * current CPU as idle.
		 */
		if ((sync && cpu == smp_processor_id()) || idle_cpu(cpu)) {
			/* Discard any previous non-idle candidate */
			if (!has_idle) {
				best = curr;
				cidx ^= 1;
			}
			has_idle = true;

			/* Nonzero exit latency indicates this CPU is idle */
			curr->exit_lat = 1;

			/* Add on the actual idle exit latency, if any */
			idle_state = idle_get_state(cpu_rq(cpu));
			if (idle_state)
				curr->exit_lat += idle_state->exit_latency;
		} else {
			/* Skip non-idle CPUs if there's an idle candidate */
			if (has_idle)
				continue;

			/* Zero exit latency indicates this CPU isn't idle */
			curr->exit_lat = 0;
		}

		/* Get this CPU's utilization, possibly without @current */
		curr->util = cass_cpu_util(cpu, sync);

		/*
		 * Add @p's utilization to this CPU if it's not @p's CPU, to
		 * find what this CPU's relative utilization would look like
		 * if @p were on it.
		 */
		if (cpu != task_cpu(p))
			curr->util += p_util;

		/*
		 * Get the current capacity of this CPU adjusted for thermal
		 * pressure as well as IRQ and RT-task time.
		 */
		curr->cap = capacity_of(cpu);

		/* Calculate the relative utilization for this CPU candidate */
		curr->util = curr->util * SCHED_CAPACITY_SCALE / curr->cap;

		/* If @best == @curr then there's no need to compare them */
		if (best == curr)
			continue;

		/* Check if this CPU is better than the best CPU found */
		if (cass_cpu_better(curr, best, prev_cpu, sync)) {
			best = curr;
			cidx ^= 1;
		}
	}
	rcu_read_unlock();

	return best->cpu;
}

static int cass_select_task_rq_fair(struct task_struct *p, int prev_cpu,
				    int sd_flag, int wake_flags,
				    int sibling_count_hint)
{
	bool sync;

	/* Don't balance on exec since we don't know what @p will look like */
	if (sd_flag & SD_BALANCE_EXEC)
		return prev_cpu;

	/*
	 * If there aren't any valid CPUs which are active, then just return the
	 * first valid CPU since it's possible for certain types of tasks to run
	 * on inactive CPUs.
	 */
	if (unlikely(!cpumask_intersects(&p->cpus_allowed, cpu_active_mask)))
		return cpumask_first(&p->cpus_allowed);

	/* cass_best_cpu() needs the task's utilization, so sync it up */
	if (!(sd_flag & SD_BALANCE_FORK))
		sync_entity_load_avg(&p->se);

	sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);

	return cass_best_cpu(p, prev_cpu, sync);
}

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
@@ -12703,6 +12703,17 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
	return rr_interval;
}

#ifdef CONFIG_SCHED_CASS
#include "cass.c"

/* Use CASS. A dummy wrapper ensures the replaced function is still "used". */
static inline void *select_task_rq_fair_dummy(void)
{
	return (void *)select_task_rq_fair;
}

#define select_task_rq_fair cass_select_task_rq_fair
#endif /* CONFIG_SCHED_CASS */

/*
 * All the scheduling class methods:
 */
