@@ -8,9 +8,11 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 #include <linux/moduleparam.h>
 #include <linux/oom.h>
+#include <linux/ratelimit.h>
 #include <linux/sched/mm.h>
 #include <linux/sort.h>
 #include <linux/vmpressure.h>
 #include <uapi/linux/sched/types.h>
@@ -33,10 +35,13 @@ struct victim_info {
 static struct victim_info victims[MAX_VICTIMS] __cacheline_aligned_in_smp;
 static struct task_struct *task_bucket[SHRT_MAX + 1] __cacheline_aligned;
 static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
+static DECLARE_WAIT_QUEUE_HEAD(reaper_waitq);
 static DECLARE_COMPLETION(reclaim_done);
 static __cacheline_aligned_in_smp DEFINE_RWLOCK(mm_free_lock);
 static int nr_victims;
+static bool reclaim_active;
 static atomic_t needs_reclaim = ATOMIC_INIT(0);
+static atomic_t needs_reap = ATOMIC_INIT(0);
 static atomic_t nr_killed = ATOMIC_INIT(0);
 
 static int victim_cmp(const void *lhs_ptr, const void *rhs_ptr)
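The declarations above carry the whole synchronization scheme: mm_free_lock guards the victims array and nr_victims, while the needs_reclaim, needs_reap, and nr_killed atomics act as lock-free flags between the vmpressure hook, the reclaim thread, and the new reaper thread. As a rough sketch of the rwlock pattern in play (all demo_* names below are hypothetical, not from the patch): a writer invalidates the array by zeroing its count under the write lock, and readers scan only under the read lock, so they can never act on stale entries once the writer returns.

#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RWLOCK(demo_lock);
static int demo_count;
static void *demo_items[8];

/* Writer side: once this returns, no reader can see a stale entry */
static void demo_invalidate(void)
{
	write_lock(&demo_lock);
	demo_count = 0;
	write_unlock(&demo_lock);
}

/* Reader side: the scan is only valid while the read lock is held */
static bool demo_contains(void *key)
{
	bool found = false;
	int i;

	read_lock(&demo_lock);
	for (i = 0; i < demo_count; i++) {
		if (demo_items[i] == key) {
			found = true;
			break;
		}
	}
	read_unlock(&demo_lock);
	return found;
}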
@@ -186,11 +191,28 @@ static int process_victims(int vlen)
 	return nr_to_kill;
 }
 
+static void set_task_rt_prio(struct task_struct *tsk, int priority)
+{
+	const struct sched_param rt_prio = {
+		.sched_priority = priority
+	};
+
+	sched_setscheduler_nocheck(tsk, SCHED_RR, &rt_prio);
+}
+
 static void scan_and_kill(void)
 {
 	int i, nr_to_kill, nr_found = 0;
 	unsigned long pages_found;
 
+	/*
+	 * Reset nr_victims so the reaper thread and simple_lmk_mm_freed() are
+	 * aware that the victims array is no longer valid.
+	 */
+	write_lock(&mm_free_lock);
+	nr_victims = 0;
+	write_unlock(&mm_free_lock);
+
 	/* Populate the victims array with tasks sorted by adj and then size */
 	pages_found = find_victims(&nr_found);
 	if (unlikely(!nr_found)) {
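The new set_task_rt_prio() helper above deduplicates the sched_param boilerplate this patch removes further down. As a hedged reference sketch of the range it accepts (the demo function below is illustrative, not part of the patch): 1 is the lowest real-time priority, MAX_RT_PRIO - 1 (i.e. 99) the highest, and the _nocheck variant skips the permission checks that would otherwise be applied to the calling context.

#include <linux/sched.h>
#include <uapi/linux/sched/types.h>

/* Hypothetical demo of the valid RT priority range for set_task_rt_prio() */
static void demo_prio_range(struct task_struct *lowest,
			    struct task_struct *highest)
{
	const struct sched_param lo = { .sched_priority = 1 };
	const struct sched_param hi = { .sched_priority = MAX_RT_PRIO - 1 };

	sched_setscheduler_nocheck(lowest, SCHED_RR, &lo);
	sched_setscheduler_nocheck(highest, SCHED_RR, &hi);
}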
@@ -219,23 +241,28 @@ static void scan_and_kill(void)
 		nr_to_kill = nr_found;
 	}
 
-	/* Store the final number of victims for simple_lmk_mm_freed() */
+	/*
+	 * Store the final number of victims for simple_lmk_mm_freed() and the
+	 * reaper thread, and indicate that reclaim is active.
+	 */
 	write_lock(&mm_free_lock);
 	nr_victims = nr_to_kill;
+	reclaim_active = true;
 	write_unlock(&mm_free_lock);
 
 	/* Kill the victims */
 	for (i = 0; i < nr_to_kill; i++) {
-		static const struct sched_param min_rt_prio = {
-			.sched_priority = 1
-		};
 		struct victim_info *victim = &victims[i];
 		struct task_struct *t, *vtsk = victim->tsk;
 		struct mm_struct *mm = victim->mm;
 
 		pr_info("Killing %s with adj %d to free %lu KiB\n", vtsk->comm,
 			vtsk->signal->oom_score_adj,
 			victim->size << (PAGE_SHIFT - 10));
 
 		/* Make the victim reap anonymous memory first in exit_mmap() */
 		set_bit(MMF_OOM_VICTIM, &mm->flags);
 
 		/* Accelerate the victim's death by forcing the kill signal */
 		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, vtsk, true);
@@ -253,7 +280,7 @@ static void scan_and_kill(void)
 		for_each_thread(vtsk, t)
 			set_tsk_thread_flag(t, TIF_MEMDIE);
 		for_each_thread(vtsk, t)
-			sched_setscheduler_nocheck(t, SCHED_RR, &min_rt_prio);
+			set_task_rt_prio(t, 1);
 		rcu_read_unlock();
 
 		/* Allow the victim to run on any CPU. This won't schedule. */
@@ -262,29 +289,42 @@ static void scan_and_kill(void)
 		/* Signals can't wake frozen tasks; only a thaw operation can */
 		__thaw_task(vtsk);
 
+		/* Store the number of anon pages to sort victims for reaping */
+		victim->size = get_mm_counter(mm, MM_ANONPAGES);
+
 		/* Finally release the victim's task lock acquired earlier */
 		task_unlock(vtsk);
 	}
 
+	/*
+	 * Sort the victims by descending order of anonymous pages so the reaper
+	 * thread can prioritize reaping the victims with the most anonymous
+	 * pages first. Then wake the reaper thread if it's asleep. The lock
+	 * orders the needs_reap store before waitqueue_active().
+	 */
+	write_lock(&mm_free_lock);
+	sort(victims, nr_to_kill, sizeof(*victims), victim_cmp, victim_swap);
+	atomic_set(&needs_reap, 1);
+	write_unlock(&mm_free_lock);
+	if (waitqueue_active(&reaper_waitq))
+		wake_up(&reaper_waitq);
+
 	/* Wait until all the victims die or until the timeout is reached */
 	if (!wait_for_completion_timeout(&reclaim_done, RECLAIM_EXPIRES))
 		pr_info("Timeout hit waiting for victims to die, proceeding\n");
 
-	/* Clean up for future reclaim invocations */
+	/* Clean up for future reclaims but let the reaper thread keep going */
 	write_lock(&mm_free_lock);
 	reinit_completion(&reclaim_done);
-	nr_victims = 0;
+	reclaim_active = false;
 	nr_killed = (atomic_t)ATOMIC_INIT(0);
 	write_unlock(&mm_free_lock);
 }
 
 static int simple_lmk_reclaim_thread(void *data)
 {
-	static const struct sched_param sched_max_rt_prio = {
-		.sched_priority = MAX_RT_PRIO - 1
-	};
-
-	sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);
+	/* Use maximum RT priority */
+	set_task_rt_prio(current, MAX_RT_PRIO - 1);
 	set_freezable();
 
 	while (1) {
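The publish-then-wake handshake above is the subtlest part of this hunk, so here it is distilled into a standalone sketch (demo_* names are hypothetical, not from the patch): the flag store happens under the lock so it is ordered before the waitqueue_active() peek, and the sleeper consumes the flag with cmpxchg so one wake services exactly one batch and a spurious wakeup is harmless.

#include <linux/atomic.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);
static DEFINE_RWLOCK(demo_lock);
static atomic_t demo_pending = ATOMIC_INIT(0);

/* Producer: publish work under the lock, then wake any sleeper. As in the
 * patch, the lock is what orders the demo_pending store before the
 * waitqueue_active() check, closing the race with a task about to sleep. */
static void demo_publish(void)
{
	write_lock(&demo_lock);
	atomic_set(&demo_pending, 1);
	write_unlock(&demo_lock);

	if (waitqueue_active(&demo_waitq))
		wake_up(&demo_waitq);
}

/* Consumer: cmpxchg consumes the flag, so a wakeup with no pending work
 * simply evaluates to false and the task goes back to sleep. */
static void demo_wait_for_work(void)
{
	wait_event(demo_waitq, atomic_cmpxchg_relaxed(&demo_pending, 1, 0));
}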
@@ -296,18 +336,124 @@ static int simple_lmk_reclaim_thread(void *data)
 	return 0;
 }
 
+static struct mm_struct *next_reap_victim(void)
+{
+	struct mm_struct *mm = NULL;
+	bool should_retry = false;
+	int i;
+
+	/* Take a write lock so no victim's mm can be freed while scanning */
+	write_lock(&mm_free_lock);
+	for (i = 0; i < nr_victims; i++, mm = NULL) {
+		/* Check if this victim is alive and hasn't been reaped yet */
+		mm = victims[i].mm;
+		if (!mm || test_bit(MMF_OOM_SKIP, &mm->flags))
+			continue;
+
+		/* Do a trylock so the reaper thread doesn't sleep */
+		if (!down_read_trylock(&mm->mmap_sem)) {
+			should_retry = true;
+			continue;
+		}
+
+		/* Skip any mm with notifiers for now since they can sleep */
+		if (mm_has_notifiers(mm)) {
+			up_read(&mm->mmap_sem);
+			should_retry = true;
+			continue;
+		}
+
+		/*
+		 * Check MMF_OOM_SKIP again under the lock in case this mm was
+		 * reaped by exit_mmap() and then had its page tables destroyed.
+		 * No mmgrab() is needed because the reclaim thread sets
+		 * MMF_OOM_VICTIM under task_lock() for the mm's task, which
+		 * guarantees that MMF_OOM_VICTIM is always set before the
+		 * victim mm can enter exit_mmap(). Therefore, an mmap read lock
+		 * is sufficient to keep the mm struct itself from being freed.
+		 */
+		if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+			break;
+
+		up_read(&mm->mmap_sem);
+	}
+	if (!mm) {
+		if (should_retry)
+			/* Return ERR_PTR(-EAGAIN) to try reaping again later */
+			mm = ERR_PTR(-EAGAIN);
+		else if (!reclaim_active)
+			/*
+			 * Nothing left to reap, so stop simple_lmk_mm_freed()
+			 * from iterating over the victims array since reclaim
+			 * is no longer active. Return NULL to stop reaping.
+			 */
+			nr_victims = 0;
+	}
+	write_unlock(&mm_free_lock);
+	return mm;
+}
+
+static void reap_victims(void)
+{
+	struct mm_struct *mm;
+
+	while ((mm = next_reap_victim())) {
+		if (IS_ERR(mm)) {
+			/* Wait one jiffy before trying to reap again */
+			schedule_timeout_uninterruptible(1);
+			continue;
+		}
+
+		/*
+		 * Reap the victim, then unflag the mm for exit_mmap() reaping
+		 * and mark it as reaped with MMF_OOM_SKIP.
+		 */
+		__oom_reap_task_mm(mm);
+		clear_bit(MMF_OOM_VICTIM, &mm->flags);
+		set_bit(MMF_OOM_SKIP, &mm->flags);
+		up_read(&mm->mmap_sem);
+	}
+}
+
+static int simple_lmk_reaper_thread(void *data)
+{
+	/* Use a lower priority than the reclaim thread */
+	set_task_rt_prio(current, MAX_RT_PRIO - 2);
+	set_freezable();
+
+	while (1) {
+		wait_event_freezable(reaper_waitq,
+				     atomic_cmpxchg_relaxed(&needs_reap, 1, 0));
+		reap_victims();
+	}
+
+	return 0;
+}
+
 void simple_lmk_mm_freed(struct mm_struct *mm)
 {
 	int i;
 
-	/* Nothing to do when reclaim is starting or ending */
-	if (!read_trylock(&mm_free_lock))
+	/*
+	 * Victims are guaranteed to have MMF_OOM_SKIP set after exit_mmap()
+	 * finishes. Use this to ignore unrelated dying processes.
+	 */
+	if (!test_bit(MMF_OOM_SKIP, &mm->flags))
 		return;
 
+	read_lock(&mm_free_lock);
 	for (i = 0; i < nr_victims; i++) {
 		if (victims[i].mm == mm) {
+			/*
+			 * Clear out this victim from the victims array and only
+			 * increment nr_killed if reclaim is active. If reclaim
+			 * isn't active, then clearing out the victim is done
+			 * solely for the reaper thread to avoid freed victims.
+			 */
 			victims[i].mm = NULL;
-			if (atomic_inc_return_relaxed(&nr_killed) == nr_victims)
+			if (reclaim_active &&
+			    atomic_inc_return_relaxed(&nr_killed) == nr_victims)
 				complete(&reclaim_done);
 			break;
 		}
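next_reap_victim() above follows a reusable contract for non-blocking scans: return a locked item, ERR_PTR(-EAGAIN) when everything was momentarily busy, or NULL when nothing remains. A condensed sketch of that shape, together with the retry loop reap_victims() builds on it (demo_* names are hypothetical, and the rw_semaphores are assumed initialized elsewhere):

#include <linux/err.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/types.h>

struct demo_entry {
	struct rw_semaphore sem;
	bool done;
};

/* Scan without sleeping: return a read-locked entry, ERR_PTR(-EAGAIN) if
 * every remaining entry was busy, or NULL when nothing is left to do. */
static struct demo_entry *demo_next(struct demo_entry *e, int n)
{
	bool busy = false;
	int i;

	for (i = 0; i < n; i++) {
		if (e[i].done)
			continue;
		if (!down_read_trylock(&e[i].sem)) {
			busy = true;
			continue;
		}
		return &e[i];
	}
	return busy ? ERR_PTR(-EAGAIN) : NULL;
}

static void demo_process_all(struct demo_entry *e, int n)
{
	struct demo_entry *cur;

	while ((cur = demo_next(e, n))) {
		if (IS_ERR(cur)) {
			/* Everything was busy; back off for one jiffy */
			schedule_timeout_uninterruptible(1);
			continue;
		}
		cur->done = true;
		up_read(&cur->sem);
	}
}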
@@ -340,6 +486,9 @@ static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
 	struct task_struct *thread;
 
 	if (!atomic_cmpxchg(&init_done, 0, 1)) {
+		thread = kthread_run(simple_lmk_reaper_thread, NULL,
+				     "simple_lmkd_reaper");
+		BUG_ON(IS_ERR(thread));
 		thread = kthread_run(simple_lmk_reclaim_thread, NULL,
 				     "simple_lmkd");
 		BUG_ON(IS_ERR(thread));
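One detail worth noting about the init path: kthread_run() returns an ERR_PTR() on failure and never NULL, which is why the bare BUG_ON(IS_ERR(thread)) checks suffice. A self-contained sketch of the same spawn-a-freezable-worker pattern (demo_* names are hypothetical, not from the patch):

#include <linux/err.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>

/* Freezable worker: set_freezable() opts the kthread into suspend/resume
 * freezing, matching what both simple_lmk threads do. */
static int demo_thread(void *data)
{
	set_freezable();

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}

static int __init demo_init(void)
{
	struct task_struct *thread;

	thread = kthread_run(demo_thread, NULL, "demo_worker");
	BUG_ON(IS_ERR(thread));
	return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");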