From 26276a19c5a7d5e61a1da14624fc8929124eb3fa Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf
Date: Fri, 9 Jul 2021 22:20:42 -0700
Subject: [PATCH] mm: vmpressure: Fix rampant inaccuracies caused by stale
 data usage

After a period of intense memory pressure is over, it's common for
vmpressure to still have old reclaim efficiency data accumulated from
this time. When memory pressure starts to rise again, this stale data
will factor into vmpressure's calculations, and can cause vmpressure to
report an erroneously high pressure. The reverse is possible, too:
vmpressure may report pressures that are erroneously low due to stale
data that's been stored.

Furthermore, since kswapd can still be performing reclaim when there
are no failed memory allocations stuck in the page allocator's slow
path, vmpressure may still report pressures when there aren't any
memory allocations to satisfy. This can cause last-resort memory
reclaimers to kill processes to free memory when it's not needed.

To fix the rampant stale data, keep track of when there are processes
utilizing reclaim in the page allocator's slow path, and reset the
accumulated data in vmpressure when a new period of elevated memory
pressure begins. Extra measures are taken for the kswapd issue
mentioned above by ignoring all reclaim efficiency data reported by
kswapd when there aren't any failed memory allocations in the page
allocator which utilize reclaim.

Note that since sr_lock can now be used from IRQ context, IRQs must be
disabled whenever sr_lock is used to prevent deadlocks.

Signed-off-by: Sultan Alsawaf
---
 include/linux/vmpressure.h |  5 +++
 mm/page_alloc.c            |  7 ++++
 mm/vmpressure.c            | 74 ++++++++++++++++++++++++++++++++------
 3 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 6503dafb8e7c..f9af04345897 100755
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -27,6 +27,9 @@ struct vmpressure {
 	struct mutex events_lock;
 
 	struct work_struct work;
+
+	atomic_long_t users;
+	rwlock_t users_lock;
 };
 
 struct mem_cgroup;
@@ -38,6 +41,8 @@ extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 		       int order);
 extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio,
 			    int order);
+extern bool vmpressure_inc_users(int order);
+extern void vmpressure_dec_users(void);
 
 #ifdef CONFIG_MEMCG
 extern void vmpressure_init(struct vmpressure *vmpr);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 51de840d394d..564d0d66986b 100755
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4090,6 +4090,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	task_cputime(current, &utime, &stime_s);
 
 	bool woke_kswapd = false;
+	bool used_vmpressure = false;
 
 	/*
 	 * We also sanity check to catch abuse of atomic reserves being used by
@@ -4128,6 +4129,8 @@ retry_cpuset:
 			atomic_long_inc(&kswapd_waiters);
 			woke_kswapd = true;
 		}
+		if (!used_vmpressure)
+			used_vmpressure = vmpressure_inc_users(order);
 		wake_all_kswapds(order, ac);
 	}
 
@@ -4222,6 +4225,8 @@ retry:
 		goto nopage;
 
 	/* Try direct reclaim and then allocating */
+	if (!used_vmpressure)
+		used_vmpressure = vmpressure_inc_users(order);
 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
 					    &did_some_progress);
 	pages_reclaimed += did_some_progress;
@@ -4357,6 +4362,8 @@ got_pg:
 	}
 	if (woke_kswapd)
 		atomic_long_dec(&kswapd_waiters);
+	if (used_vmpressure)
+		vmpressure_dec_users();
 	if (!page)
 		warn_alloc(gfp_mask, ac->nodemask,
 				"page allocation failure: order:%u", order);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 50f4134e65cf..8d6eeacaf3ad 100755
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -217,11 +217,12 @@ static void vmpressure_work_fn(struct work_struct *work)
 	unsigned long scanned;
 	unsigned long reclaimed;
 	unsigned long pressure;
+	unsigned long flags;
 	enum vmpressure_levels level;
 	bool ancestor = false;
 	bool signalled = false;
 
-	spin_lock(&vmpr->sr_lock);
+	spin_lock_irqsave(&vmpr->sr_lock, flags);
 	/*
 	 * Several contexts might be calling vmpressure(), so it is
 	 * possible that the work was rescheduled again before the old
@@ -232,14 +233,14 @@ static void vmpressure_work_fn(struct work_struct *work)
 	 */
 	scanned = vmpr->tree_scanned;
 	if (!scanned) {
-		spin_unlock(&vmpr->sr_lock);
+		spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 		return;
 	}
 
 	reclaimed = vmpr->tree_reclaimed;
 	vmpr->tree_scanned = 0;
 	vmpr->tree_reclaimed = 0;
-	spin_unlock(&vmpr->sr_lock);
+	spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 
 	pressure = vmpressure_calc_pressure(scanned, reclaimed);
 	level = vmpressure_level(pressure);
@@ -280,6 +281,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 			      unsigned long reclaimed)
 {
 	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+	unsigned long flags;
 
 	/*
 	 * If we got here with no pages scanned, then that is an indicator
@@ -295,10 +297,10 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 		return;
 
 	if (tree) {
-		spin_lock(&vmpr->sr_lock);
+		spin_lock_irqsave(&vmpr->sr_lock, flags);
 		scanned = vmpr->tree_scanned += scanned;
 		vmpr->tree_reclaimed += reclaimed;
-		spin_unlock(&vmpr->sr_lock);
+		spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 
 		if (!critical && scanned < calculate_vmpressure_win())
 			return;
@@ -311,15 +313,15 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 	if (!memcg || memcg == root_mem_cgroup)
 		return;
 
-	spin_lock(&vmpr->sr_lock);
+	spin_lock_irqsave(&vmpr->sr_lock, flags);
 	scanned = vmpr->scanned += scanned;
 	reclaimed = vmpr->reclaimed += reclaimed;
 	if (!critical && scanned < calculate_vmpressure_win()) {
-		spin_unlock(&vmpr->sr_lock);
+		spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 		return;
 	}
 	vmpr->scanned = vmpr->reclaimed = 0;
-	spin_unlock(&vmpr->sr_lock);
+	spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 
 	pressure = vmpressure_calc_pressure(scanned, reclaimed);
 	level = vmpressure_level(pressure);
@@ -343,17 +345,49 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 			      unsigned long reclaimed)
 {
 }
 #endif
 
+bool vmpressure_inc_users(int order)
+{
+	struct vmpressure *vmpr = &global_vmpressure;
+	unsigned long flags;
+
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		return false;
+
+	write_lock_irqsave(&vmpr->users_lock, flags);
+	if (atomic_long_inc_return_relaxed(&vmpr->users) == 1) {
+		/* Clear out stale vmpressure data when reclaim begins */
+		spin_lock(&vmpr->sr_lock);
+		vmpr->scanned = 0;
+		vmpr->reclaimed = 0;
+		vmpr->stall = 0;
+		spin_unlock(&vmpr->sr_lock);
+	}
+	write_unlock_irqrestore(&vmpr->users_lock, flags);
+
+	return true;
+}
+
+void vmpressure_dec_users(void)
+{
+	struct vmpressure *vmpr = &global_vmpressure;
+
+	/* Decrement the vmpressure user count with release semantics */
+	smp_mb__before_atomic();
+	atomic_long_dec(&vmpr->users);
+}
+
 static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
 			      unsigned long reclaimed)
 {
 	struct vmpressure *vmpr = &global_vmpressure;
 	unsigned long pressure;
 	unsigned long stall;
+	unsigned long flags;
 
 	if (critical)
 		scanned = calculate_vmpressure_win();
 
-	spin_lock(&vmpr->sr_lock);
+	spin_lock_irqsave(&vmpr->sr_lock, flags);
 	if (scanned) {
 		vmpr->scanned += scanned;
 		vmpr->reclaimed += reclaimed;
@@ -366,14 +400,14 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
 		reclaimed = vmpr->reclaimed;
 
 		if (!critical && scanned < calculate_vmpressure_win()) {
-			spin_unlock(&vmpr->sr_lock);
+			spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 			return;
 		}
 	}
 	vmpr->scanned = 0;
 	vmpr->reclaimed = 0;
 	vmpr->stall = 0;
-	spin_unlock(&vmpr->sr_lock);
+	spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 
 	if (scanned) {
 		pressure = vmpressure_calc_pressure(scanned, reclaimed);
@@ -419,9 +453,25 @@ static void __vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 		unsigned long scanned, unsigned long reclaimed, int order)
 {
+	struct vmpressure *vmpr = &global_vmpressure;
+	unsigned long flags;
+
 	if (order > PAGE_ALLOC_COSTLY_ORDER)
 		return;
 
+	/*
+	 * It's possible for kswapd to keep doing reclaim even though memory
+	 * pressure isn't high anymore. We should only track vmpressure when
+	 * there are failed memory allocations actively stuck in the page
+	 * allocator's slow path. No failed allocations means pressure is fine.
+	 */
+	read_lock_irqsave(&vmpr->users_lock, flags);
+	if (!atomic_long_read(&vmpr->users)) {
+		read_unlock_irqrestore(&vmpr->users_lock, flags);
+		return;
+	}
+	read_unlock_irqrestore(&vmpr->users_lock, flags);
+
 	__vmpressure(gfp, memcg, false, tree, scanned, reclaimed);
 }
 
@@ -590,6 +640,8 @@ void vmpressure_init(struct vmpressure *vmpr)
 	mutex_init(&vmpr->events_lock);
 	INIT_LIST_HEAD(&vmpr->events);
 	INIT_WORK(&vmpr->work, vmpressure_work_fn);
+	atomic_long_set(&vmpr->users, 0);
+	rwlock_init(&vmpr->users_lock);
 }
 
 /**
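
Not part of the patch, for reviewer context only: below is a minimal userspace
sketch of the kind of last-resort memory reclaimer the commit message refers
to, i.e. a daemon that acts on vmpressure notifications and may kill processes
whenever vmpressure over-reports. It uses the documented cgroup-v1
memory.pressure_level eventfd interface; the /sys/fs/cgroup/memory mount
point, the choice of the root memory cgroup, and the "critical" level are
illustrative assumptions, not anything this patch adds or changes.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdint.h>
#include <sys/eventfd.h>

int main(void)
{
	const char *cg = "/sys/fs/cgroup/memory";	/* assumed cgroup-v1 mount */
	char buf[128];
	uint64_t ticks;
	int efd, pfd, cfd;

	efd = eventfd(0, 0);
	snprintf(buf, sizeof(buf), "%s/memory.pressure_level", cg);
	pfd = open(buf, O_RDONLY);
	snprintf(buf, sizeof(buf), "%s/cgroup.event_control", cg);
	cfd = open(buf, O_WRONLY);
	if (efd < 0 || pfd < 0 || cfd < 0) {
		perror("setup");
		return 1;
	}

	/* Register the listener: "<event_fd> <pressure_level_fd> <level>" */
	snprintf(buf, sizeof(buf), "%d %d critical", efd, pfd);
	if (write(cfd, buf, strlen(buf)) < 0) {
		perror("cgroup.event_control");
		return 1;
	}

	/* Each counter update is one "critical" vmpressure notification */
	while (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("critical memory pressure reported\n");

	return 0;
}

Every spurious eventfd tick here is a potential unnecessary kill, which is why
resetting vmpressure's accumulated scanned/reclaimed data between pressure
episodes and ignoring kswapd-only reclaim matters to consumers like this.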