From 2cceda3c699f19f9c2f287614db2fe5dd009f73a Mon Sep 17 00:00:00 2001
From: Sultan Alsawaf <sultan@kerneltoast.com>
Date: Sun, 19 Apr 2020 19:59:18 -0700
Subject: ZEN: mm: Stop kswapd early when nothing's waiting for it to free
 pages

Contains:
  - mm: Stop kswapd early when nothing's waiting for it to free pages

    Keeping kswapd running when all the failed allocations that invoked it
    are satisfied incurs a high overhead due to unnecessary page eviction
    and writeback, as well as spurious VM pressure events to various
    registered shrinkers. When kswapd doesn't need to work to make an
    allocation succeed anymore, stop it prematurely to save resources.

    Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>

  - mm: Don't stop kswapd on a per-node basis when there are no waiters

    The page allocator wakes all kswapds in an allocation context's allowed
    nodemask in the slow path, so it doesn't make sense to have the kswapd-
    waiter count per each NUMA node. Instead, it should be a global counter
    to stop all kswapds when there are no failed allocation requests.

    Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>

  - mm: Increment kswapd_waiters for throttled direct reclaimers

    Throttled direct reclaimers will wake up kswapd and wait for kswapd to
    satisfy their page allocation request, even when the failed allocation
    lacks the __GFP_KSWAPD_RECLAIM flag in its gfp mask. As a result, kswapd
    may think that there are no waiters and thus exit prematurely, causing
    throttled direct reclaimers lacking __GFP_KSWAPD_RECLAIM to stall on
    waiting for kswapd to wake them up. Incrementing the kswapd_waiters
    counter when such direct reclaimers become throttled fixes the problem.

    Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
---
 mm/internal.h   |  1 +
 mm/page_alloc.c | 17 ++++++++++++++---
 mm/vmscan.c     | 19 +++++++++++++------
 3 files changed, 28 insertions(+), 9 deletions(-)

--- a/mm/internal.h
+++ b/mm/internal.h
@@ -739,6 +739,7 @@ extern void post_alloc_hook(struct page
 extern bool free_pages_prepare(struct page *page, unsigned int order);
 
 extern int user_min_free_kbytes;
+extern atomic_long_t kswapd_waiters;
 
 void free_unref_page(struct page *page, unsigned int order);
 void free_unref_folios(struct folio_batch *fbatch);
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,8 @@ typedef int __bitwise fpi_t;
  */
 #define FPI_TO_TAIL		((__force fpi_t)BIT(1))
 
+atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0);
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -4218,6 +4220,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, u
 	unsigned int cpuset_mems_cookie;
 	unsigned int zonelist_iter_cookie;
 	int reserve_flags;
+	bool woke_kswapd = false;
 
 	if (unlikely(nofail)) {
 		/*
@@ -4276,8 +4279,13 @@ restart:
 			goto nopage;
 	}
 
-	if (alloc_flags & ALLOC_KSWAPD)
+	if (alloc_flags & ALLOC_KSWAPD) {
+		if (!woke_kswapd) {
+			atomic_long_inc(&kswapd_waiters);
+			woke_kswapd = true;
+		}
 		wake_all_kswapds(order, gfp_mask, ac);
+	}
 
 	/*
 	 * The adjusted alloc_flags might result in immediate success, so try
@@ -4479,9 +4487,12 @@ nopage:
 		goto retry;
 	}
 fail:
-	warn_alloc(gfp_mask, ac->nodemask,
-			"page allocation failure: order:%u", order);
 got_pg:
+	if (woke_kswapd)
+		atomic_long_dec(&kswapd_waiters);
+	if (!page)
+		warn_alloc(gfp_mask, ac->nodemask,
+				"page allocation failure: order:%u", order);
 	return page;
 }
 
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6346,7 +6346,7 @@ retry:
 	return 0;
 }
 
-static bool allow_direct_reclaim(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd)
 {
 	struct zone *zone;
 	unsigned long pfmemalloc_reserve = 0;
@@ -6375,6 +6375,10 @@ static bool allow_direct_reclaim(pg_data
 
 	wmark_ok = free_pages > pfmemalloc_reserve / 2;
 
+	/* The throttled direct reclaimer is now a kswapd waiter */
+	if (unlikely(!using_kswapd && !wmark_ok))
+		atomic_long_inc(&kswapd_waiters);
+
 	/* kswapd must be awake if processes are being throttled */
 	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
 		if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
@@ -6440,7 +6444,7 @@ static bool throttle_direct_reclaim(gfp_
 
 		/* Throttle based on the first usable node */
 		pgdat = zone->zone_pgdat;
-		if (allow_direct_reclaim(pgdat))
+		if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM))
 			goto out;
 		break;
 	}
@@ -6462,11 +6466,14 @@ static bool throttle_direct_reclaim(gfp_
 	 */
 	if (!(gfp_mask & __GFP_FS))
 		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-			allow_direct_reclaim(pgdat), HZ);
+			allow_direct_reclaim(pgdat, true), HZ);
 	else
 		/* Throttle until kswapd wakes the process */
 		wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-			allow_direct_reclaim(pgdat));
+			allow_direct_reclaim(pgdat, true));
+
+	if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM)))
+		atomic_long_dec(&kswapd_waiters);
 
 	if (fatal_signal_pending(current))
 		return true;
@@ -6969,14 +6976,14 @@ restart:
 		 * able to safely make forward progress. Wake them
 		 */
 		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-				allow_direct_reclaim(pgdat))
+				allow_direct_reclaim(pgdat, true))
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
 		/* Check if kswapd should be suspending */
 		__fs_reclaim_release(_THIS_IP_);
 		ret = kthread_freezable_should_stop(&was_frozen);
 		__fs_reclaim_acquire(_THIS_IP_);
-		if (was_frozen || ret)
+		if (was_frozen || ret || !atomic_long_read(&kswapd_waiters))
 			break;
 
 		/*