168 lines
5.6 KiB
Diff
168 lines
5.6 KiB
Diff
From 2cceda3c699f19f9c2f287614db2fe5dd009f73a Mon Sep 17 00:00:00 2001
|
|
From: Sultan Alsawaf <sultan@kerneltoast.com>
|
|
Date: Sun, 19 Apr 2020 19:59:18 -0700
|
|
Subject: ZEN: mm: Stop kswapd early when nothing's waiting for it to free
|
|
pages
|
|
|
|
Contains:
|
|
- mm: Stop kswapd early when nothing's waiting for it to free pages
|
|
|
|
Keeping kswapd running when all the failed allocations that invoked it
|
|
are satisfied incurs a high overhead due to unnecessary page eviction
|
|
and writeback, as well as spurious VM pressure events to various
|
|
registered shrinkers. When kswapd doesn't need to work to make an
|
|
allocation succeed anymore, stop it early to save resources.
|
|
|
|
Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
|
|
|
|
- mm: Don't stop kswapd on a per-node basis when there are no waiters
|
|
|
|
The page allocator wakes all kswapds in an allocation context's allowed
|
|
nodemask in the slow path, so it doesn't make sense to have the kswapd-
|
|
waiter count per NUMA node. Instead, it should be a global counter
|
|
to stop all kswapds when there are no failed allocation requests.
|
|
|
|
Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
|
|
|
|
- mm: Increment kswapd_waiters for throttled direct reclaimers
|
|
|
|
Throttled direct reclaimers will wake up kswapd and wait for kswapd to
|
|
satisfy their page allocation request, even when the failed allocation
|
|
lacks the __GFP_KSWAPD_RECLAIM flag in its gfp mask. As a result, kswapd
|
|
may think that there are no waiters and thus exit prematurely, causing
|
|
throttled direct reclaimers lacking __GFP_KSWAPD_RECLAIM to stall while
|
|
waiting for kswapd to wake them up. Incrementing the kswapd_waiters
|
|
counter when such direct reclaimers become throttled fixes the problem.
|
|
|
|
Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
|
|
---
|
|
mm/internal.h | 1 +
|
|
mm/page_alloc.c | 17 ++++++++++++++---
|
|
mm/vmscan.c | 19 +++++++++++++------
|
|
3 files changed, 28 insertions(+), 9 deletions(-)
|
|
|
|
--- a/mm/internal.h
|
|
+++ b/mm/internal.h
|
|
@@ -739,6 +739,7 @@ extern void post_alloc_hook(struct page
|
|
extern bool free_pages_prepare(struct page *page, unsigned int order);
|
|
|
|
extern int user_min_free_kbytes;
|
|
+extern atomic_long_t kswapd_waiters;
|
|
|
|
void free_unref_page(struct page *page, unsigned int order);
|
|
void free_unref_folios(struct folio_batch *fbatch);
|
|
--- a/mm/page_alloc.c
|
|
+++ b/mm/page_alloc.c
|
|
@@ -88,6 +88,8 @@ typedef int __bitwise fpi_t;
|
|
*/
|
|
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
|
|
|
|
+atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0);
|
|
+
|
|
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
|
|
static DEFINE_MUTEX(pcp_batch_high_lock);
|
|
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
|
|
@@ -4218,6 +4220,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
|
|
unsigned int cpuset_mems_cookie;
|
|
unsigned int zonelist_iter_cookie;
|
|
int reserve_flags;
|
|
+ bool woke_kswapd = false;
|
|
|
|
if (unlikely(nofail)) {
|
|
/*
|
|
@@ -4276,8 +4279,13 @@ restart:
|
|
goto nopage;
|
|
}
|
|
|
|
- if (alloc_flags & ALLOC_KSWAPD)
|
|
+ if (alloc_flags & ALLOC_KSWAPD) {
|
|
+ if (!woke_kswapd) {
|
|
+ atomic_long_inc(&kswapd_waiters);
|
|
+ woke_kswapd = true;
|
|
+ }
|
|
wake_all_kswapds(order, gfp_mask, ac);
|
|
+ }
|
|
|
|
/*
|
|
* The adjusted alloc_flags might result in immediate success, so try
|
|
@@ -4479,9 +4487,12 @@ nopage:
|
|
goto retry;
|
|
}
|
|
fail:
|
|
- warn_alloc(gfp_mask, ac->nodemask,
|
|
- "page allocation failure: order:%u", order);
|
|
got_pg:
|
|
+ if (woke_kswapd)
|
|
+ atomic_long_dec(&kswapd_waiters);
|
|
+ if (!page)
|
|
+ warn_alloc(gfp_mask, ac->nodemask,
|
|
+ "page allocation failure: order:%u", order);
|
|
return page;
|
|
}
|
|
|
|
--- a/mm/vmscan.c
|
|
+++ b/mm/vmscan.c
|
|
@@ -6346,7 +6346,7 @@ retry:
|
|
return 0;
|
|
}
|
|
|
|
-static bool allow_direct_reclaim(pg_data_t *pgdat)
|
|
+static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd)
|
|
{
|
|
struct zone *zone;
|
|
unsigned long pfmemalloc_reserve = 0;
|
|
@@ -6375,6 +6375,10 @@ static bool allow_direct_reclaim(pg_data
|
|
|
|
wmark_ok = free_pages > pfmemalloc_reserve / 2;
|
|
|
|
+ /* The throttled direct reclaimer is now a kswapd waiter */
|
|
+ if (unlikely(!using_kswapd && !wmark_ok))
|
|
+ atomic_long_inc(&kswapd_waiters);
|
|
+
|
|
/* kswapd must be awake if processes are being throttled */
|
|
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
|
|
if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
|
|
@@ -6440,7 +6444,7 @@ static bool throttle_direct_reclaim(gfp_
|
|
|
|
/* Throttle based on the first usable node */
|
|
pgdat = zone->zone_pgdat;
|
|
- if (allow_direct_reclaim(pgdat))
|
|
+ if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM))
|
|
goto out;
|
|
break;
|
|
}
|
|
@@ -6462,11 +6466,14 @@ static bool throttle_direct_reclaim(gfp_
|
|
*/
|
|
if (!(gfp_mask & __GFP_FS))
|
|
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
|
|
- allow_direct_reclaim(pgdat), HZ);
|
|
+ allow_direct_reclaim(pgdat, true), HZ);
|
|
else
|
|
/* Throttle until kswapd wakes the process */
|
|
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
|
|
- allow_direct_reclaim(pgdat));
|
|
+ allow_direct_reclaim(pgdat, true));
|
|
+
|
|
+ if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM)))
|
|
+ atomic_long_dec(&kswapd_waiters);
|
|
|
|
if (fatal_signal_pending(current))
|
|
return true;
|
|
@@ -6969,14 +6976,14 @@ restart:
|
|
* able to safely make forward progress. Wake them
|
|
*/
|
|
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
|
|
- allow_direct_reclaim(pgdat))
|
|
+ allow_direct_reclaim(pgdat, true))
|
|
wake_up_all(&pgdat->pfmemalloc_wait);
|
|
|
|
/* Check if kswapd should be suspending */
|
|
__fs_reclaim_release(_THIS_IP_);
|
|
ret = kthread_freezable_should_stop(&was_frozen);
|
|
__fs_reclaim_acquire(_THIS_IP_);
|
|
- if (was_frozen || ret)
|
|
+ if (was_frozen || ret || !atomic_long_read(&kswapd_waiters))
|
|
break;
|
|
|
|
/*
|