mm: vmpressure: Fix rampant inaccuracies caused by stale data usage

After a period of intense memory pressure is over, it's common for
vmpressure to still hold old reclaim efficiency data accumulated during
that period. When memory pressure starts to rise again, this stale data
factors into vmpressure's calculations and can cause vmpressure to
report erroneously high pressure. The reverse is possible, too:
vmpressure may report erroneously low pressure because of the stale
data it has stored.

Furthermore, since kswapd can still be performing reclaim when there are
no failed memory allocations stuck in the page allocator's slow path,
vmpressure may report pressure even when there aren't any memory
allocations to satisfy. This can cause last-resort memory reclaimers to
kill processes to free memory when it isn't needed.

To fix the rampant stale data, keep track of when there are processes
utilizing reclaim in the page allocator's slow path, and reset
vmpressure's accumulated data when a new period of elevated memory
pressure begins. The kswapd issue mentioned above is addressed by
ignoring all reclaim efficiency data reported by kswapd when there are
no failed memory allocations in the page allocator that utilize
reclaim.

Note that since sr_lock can now be used from IRQ context, IRQs must be
disabled whenever sr_lock is used to prevent deadlocks.
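
In rough outline (a condensed sketch of the logic in the diff below; the
memcg path, locking details, and error handling are omitted):

	/* mm/page_alloc.c: an allocation stuck in the slow path registers
	 * itself as a vmpressure user before reclaim runs... */
	if (!used_vmpressure)
		used_vmpressure = vmpressure_inc_users(order);

	/* ...and unregisters on its way out of the slow path. */
	if (used_vmpressure)
		vmpressure_dec_users();

	/* mm/vmpressure.c: the first user of a new pressure episode clears
	 * the stale accumulators... */
	if (atomic_long_inc_return_relaxed(&vmpr->users) == 1) {
		vmpr->scanned = 0;
		vmpr->reclaimed = 0;
		vmpr->stall = 0;
	}

	/* ...and vmpressure() discards samples (e.g. from kswapd) outright
	 * while the user count is zero. */
	if (!atomic_long_read(&vmpr->users))
		return;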

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
Sultan Alsawaf authored 4 years ago; committed by Jenna
parent faf978cab1
commit 26276a19c5
 include/linux/vmpressure.h |  5
 mm/page_alloc.c            |  7
 mm/vmpressure.c            | 74
 3 files changed

--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -27,6 +27,9 @@ struct vmpressure {
 	struct mutex events_lock;
 	struct work_struct work;
+	atomic_long_t users;
+	rwlock_t users_lock;
 };
 struct mem_cgroup;
@@ -38,6 +41,8 @@ extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 		       int order);
 extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio,
 			    int order);
+extern bool vmpressure_inc_users(int order);
+extern void vmpressure_dec_users(void);
 #ifdef CONFIG_MEMCG
 extern void vmpressure_init(struct vmpressure *vmpr);

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4090,6 +4090,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	task_cputime(current, &utime, &stime_s);
 	bool woke_kswapd = false;
+	bool used_vmpressure = false;
 	/*
 	 * We also sanity check to catch abuse of atomic reserves being used by
@@ -4128,6 +4129,8 @@ retry_cpuset:
 			atomic_long_inc(&kswapd_waiters);
 			woke_kswapd = true;
 		}
+		if (!used_vmpressure)
+			used_vmpressure = vmpressure_inc_users(order);
 		wake_all_kswapds(order, ac);
 	}
@@ -4222,6 +4225,8 @@ retry:
 		goto nopage;
 	/* Try direct reclaim and then allocating */
+	if (!used_vmpressure)
+		used_vmpressure = vmpressure_inc_users(order);
 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
 					    &did_some_progress);
 	pages_reclaimed += did_some_progress;
@@ -4357,6 +4362,8 @@ got_pg:
 	}
 	if (woke_kswapd)
 		atomic_long_dec(&kswapd_waiters);
+	if (used_vmpressure)
+		vmpressure_dec_users();
 	if (!page)
 		warn_alloc(gfp_mask, ac->nodemask,
 			   "page allocation failure: order:%u", order);

--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -217,11 +217,12 @@ static void vmpressure_work_fn(struct work_struct *work)
 	unsigned long scanned;
 	unsigned long reclaimed;
 	unsigned long pressure;
+	unsigned long flags;
 	enum vmpressure_levels level;
 	bool ancestor = false;
 	bool signalled = false;
-	spin_lock(&vmpr->sr_lock);
+	spin_lock_irqsave(&vmpr->sr_lock, flags);
 	/*
 	 * Several contexts might be calling vmpressure(), so it is
 	 * possible that the work was rescheduled again before the old
@@ -232,14 +233,14 @@ static void vmpressure_work_fn(struct work_struct *work)
 	 */
 	scanned = vmpr->tree_scanned;
 	if (!scanned) {
-		spin_unlock(&vmpr->sr_lock);
+		spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 		return;
 	}
 	reclaimed = vmpr->tree_reclaimed;
 	vmpr->tree_scanned = 0;
 	vmpr->tree_reclaimed = 0;
-	spin_unlock(&vmpr->sr_lock);
+	spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 	pressure = vmpressure_calc_pressure(scanned, reclaimed);
 	level = vmpressure_level(pressure);
@@ -280,6 +281,7 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 			     unsigned long reclaimed)
 {
 	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+	unsigned long flags;
 	/*
	 * If we got here with no pages scanned, then that is an indicator
@@ -295,10 +297,10 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 		return;
 	if (tree) {
-		spin_lock(&vmpr->sr_lock);
+		spin_lock_irqsave(&vmpr->sr_lock, flags);
 		scanned = vmpr->tree_scanned += scanned;
 		vmpr->tree_reclaimed += reclaimed;
-		spin_unlock(&vmpr->sr_lock);
+		spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 		if (!critical && scanned < calculate_vmpressure_win())
 			return;
@@ -311,15 +313,15 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 		if (!memcg || memcg == root_mem_cgroup)
 			return;
-		spin_lock(&vmpr->sr_lock);
+		spin_lock_irqsave(&vmpr->sr_lock, flags);
 		scanned = vmpr->scanned += scanned;
 		reclaimed = vmpr->reclaimed += reclaimed;
 		if (!critical && scanned < calculate_vmpressure_win()) {
-			spin_unlock(&vmpr->sr_lock);
+			spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 			return;
 		}
 		vmpr->scanned = vmpr->reclaimed = 0;
-		spin_unlock(&vmpr->sr_lock);
+		spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 		pressure = vmpressure_calc_pressure(scanned, reclaimed);
 		level = vmpressure_level(pressure);
@@ -343,17 +345,49 @@ static void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 			     unsigned long reclaimed) { }
 #endif
+bool vmpressure_inc_users(int order)
+{
+	struct vmpressure *vmpr = &global_vmpressure;
+	unsigned long flags;
+
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		return false;
+
+	write_lock_irqsave(&vmpr->users_lock, flags);
+	if (atomic_long_inc_return_relaxed(&vmpr->users) == 1) {
+		/* Clear out stale vmpressure data when reclaim begins */
+		spin_lock(&vmpr->sr_lock);
+		vmpr->scanned = 0;
+		vmpr->reclaimed = 0;
+		vmpr->stall = 0;
+		spin_unlock(&vmpr->sr_lock);
+	}
+	write_unlock_irqrestore(&vmpr->users_lock, flags);
+
+	return true;
+}
+
+void vmpressure_dec_users(void)
+{
+	struct vmpressure *vmpr = &global_vmpressure;
+
+	/* Decrement the vmpressure user count with release semantics */
+	smp_mb__before_atomic();
+	atomic_long_dec(&vmpr->users);
+}
+
 static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
 			      unsigned long reclaimed)
 {
 	struct vmpressure *vmpr = &global_vmpressure;
 	unsigned long pressure;
 	unsigned long stall;
+	unsigned long flags;
 	if (critical)
 		scanned = calculate_vmpressure_win();
-	spin_lock(&vmpr->sr_lock);
+	spin_lock_irqsave(&vmpr->sr_lock, flags);
 	if (scanned) {
 		vmpr->scanned += scanned;
 		vmpr->reclaimed += reclaimed;
@@ -366,14 +400,14 @@ static void vmpressure_global(gfp_t gfp, unsigned long scanned, bool critical,
 		reclaimed = vmpr->reclaimed;
 		if (!critical && scanned < calculate_vmpressure_win()) {
-			spin_unlock(&vmpr->sr_lock);
+			spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 			return;
 		}
 	}
 	vmpr->scanned = 0;
 	vmpr->reclaimed = 0;
 	vmpr->stall = 0;
-	spin_unlock(&vmpr->sr_lock);
+	spin_unlock_irqrestore(&vmpr->sr_lock, flags);
 	if (scanned) {
 		pressure = vmpressure_calc_pressure(scanned, reclaimed);
@@ -419,9 +453,25 @@ static void __vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool critical,
 void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 		unsigned long scanned, unsigned long reclaimed, int order)
 {
+	struct vmpressure *vmpr = &global_vmpressure;
+	unsigned long flags;
+
 	if (order > PAGE_ALLOC_COSTLY_ORDER)
 		return;
+
+	/*
+	 * It's possible for kswapd to keep doing reclaim even though memory
+	 * pressure isn't high anymore. We should only track vmpressure when
+	 * there are failed memory allocations actively stuck in the page
+	 * allocator's slow path. No failed allocations means pressure is fine.
+	 */
+	read_lock_irqsave(&vmpr->users_lock, flags);
+	if (!atomic_long_read(&vmpr->users)) {
+		read_unlock_irqrestore(&vmpr->users_lock, flags);
+		return;
+	}
+	read_unlock_irqrestore(&vmpr->users_lock, flags);
+
 	__vmpressure(gfp, memcg, false, tree, scanned, reclaimed);
 }
@@ -590,6 +640,8 @@ void vmpressure_init(struct vmpressure *vmpr)
 	mutex_init(&vmpr->events_lock);
 	INIT_LIST_HEAD(&vmpr->events);
 	INIT_WORK(&vmpr->work, vmpressure_work_fn);
+	atomic_long_set(&vmpr->users, 0);
+	rwlock_init(&vmpr->users_lock);
 }
 /**
