Skip to content

Commit e7a5f24

Browse files
softkauakpm00
authored and committed
mm: re-enable kswapd when memory pressure subsides or demotion is toggled
If kswapd fails to reclaim pages from a node MAX_RECLAIM_RETRIES times in a row, kswapd on that node gets disabled. That is, the system won't wake up kswapd for that node until page reclamation is observed at least once. That reclamation is mostly done by direct reclaim, which in turn re-enables kswapd. However, on systems with CXL memory nodes, workloads with high anon page usage can disable kswapd indefinitely, without triggering direct reclaim. This can be reproduced with the following steps: numa node 0 (32GB memory, 48 CPUs) numa node 2~5 (512GB CXL memory, 128GB each) (numa node 1 is disabled) swap space 8GB 1) Set /sys/kernel/mm/demotion_enabled to 0. 2) Set /proc/sys/kernel/numa_balancing to 0. 3) Run a process that allocates and randomly accesses 500GB of anon pages. 4) Let the process exit normally. During 3), free memory on node 0 drops below the low watermark, and kswapd runs and depletes swap space. Then, kswapd fails consecutively and gets disabled. Allocation afterwards happens on CXL memory, so node 0 never gains enough memory pressure to trigger direct reclaim. After 4), kswapd on node 0 remains disabled, and tasks running on that node are unable to swap. If you turn on NUMA_BALANCING_MEMORY_TIERING and demotion now, they won't work properly since kswapd is disabled. To mitigate this problem, reset kswapd_failures to 0 under the following conditions: a) the ZONE_BELOW_HIGH bit of a zone in a hopeless node with a fallback memory node gets cleared. b) demotion_enabled is changed from false to true. Rationale for a): the ZONE_BELOW_HIGH bit being cleared might be a sign that the node may be reclaimable afterwards. This won't help much if the memory-hungry process keeps running without freeing anything, but at least the node will go back to a reclaimable state when the process exits. Rationale for b): When demotion_enabled is false, kswapd can only reclaim anon pages by swapping them out to swap space. 
If demotion_enabled is turned on, kswapd can demote anon pages to another node for reclaiming. So, the original failure count for determining reclaimability is no longer valid. Since a reset of kswapd_failures could race with a concurrent ++ increment and be lost, its type is changed from int to atomic_t. [akpm@linux-foundation.org: tweak whitespace] Link: https://lkml.kernel.org/r/aL6qGi69jWXfPc4D@pcw-MS-7D22 Signed-off-by: Chanwon Park <flyinrm@gmail.com> Cc: Brendan Jackman <jackmanb@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Qi Zheng <zhengqi.arch@bytedance.com> Cc: Shakeel Butt <shakeel.butt@linux.dev> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent c563252 commit e7a5f24

File tree

6 files changed

+45
-17
lines changed

6 files changed

+45
-17
lines changed

include/linux/mmzone.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1440,7 +1440,7 @@ typedef struct pglist_data {
14401440
int kswapd_order;
14411441
enum zone_type kswapd_highest_zoneidx;
14421442

1443-
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
1443+
atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */
14441444

14451445
#ifdef CONFIG_COMPACTION
14461446
int kcompactd_max_order;

mm/memory-tiers.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -942,11 +942,23 @@ static ssize_t demotion_enabled_store(struct kobject *kobj,
942942
const char *buf, size_t count)
943943
{
944944
ssize_t ret;
945+
bool before = numa_demotion_enabled;
945946

946947
ret = kstrtobool(buf, &numa_demotion_enabled);
947948
if (ret)
948949
return ret;
949950

951+
/*
952+
* Reset kswapd_failures statistics. They may no longer be
953+
* valid since the policy for kswapd has changed.
954+
*/
955+
if (before == false && numa_demotion_enabled == true) {
956+
struct pglist_data *pgdat;
957+
958+
for_each_online_pgdat(pgdat)
959+
atomic_set(&pgdat->kswapd_failures, 0);
960+
}
961+
950962
return count;
951963
}
952964

mm/page_alloc.c

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2860,14 +2860,29 @@ static void free_frozen_page_commit(struct zone *zone,
28602860
*/
28612861
return;
28622862
}
2863+
28632864
high = nr_pcp_high(pcp, zone, batch, free_high);
2864-
if (pcp->count >= high) {
2865-
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
2866-
pcp, pindex);
2867-
if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
2868-
zone_watermark_ok(zone, 0, high_wmark_pages(zone),
2869-
ZONE_MOVABLE, 0))
2870-
clear_bit(ZONE_BELOW_HIGH, &zone->flags);
2865+
if (pcp->count < high)
2866+
return;
2867+
2868+
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
2869+
pcp, pindex);
2870+
if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
2871+
zone_watermark_ok(zone, 0, high_wmark_pages(zone),
2872+
ZONE_MOVABLE, 0)) {
2873+
struct pglist_data *pgdat = zone->zone_pgdat;
2874+
clear_bit(ZONE_BELOW_HIGH, &zone->flags);
2875+
2876+
/*
2877+
* Assume that memory pressure on this node is gone and may be
2878+
* in a reclaimable state. If a memory fallback node exists,
2879+
* direct reclaim may not have been triggered, causing a
2880+
* 'hopeless node' to stay in that state for a while. Let
2881+
* kswapd work again by resetting kswapd_failures.
2882+
*/
2883+
if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
2884+
next_memory_node(pgdat->node_id) < MAX_NUMNODES)
2885+
atomic_set(&pgdat->kswapd_failures, 0);
28712886
}
28722887
}
28732888

mm/show_mem.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
278278
#endif
279279
K(node_page_state(pgdat, NR_PAGETABLE)),
280280
K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
281-
str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES),
281+
str_yes_no(atomic_read(&pgdat->kswapd_failures) >=
282+
MAX_RECLAIM_RETRIES),
282283
K(node_page_state(pgdat, NR_BALLOON_PAGES)));
283284
}
284285

mm/vmscan.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
518518
* If kswapd is disabled, reschedule if necessary but do not
519519
* throttle as the system is likely near OOM.
520520
*/
521-
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
521+
if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
522522
return true;
523523

524524
/*
@@ -5101,7 +5101,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
51015101
blk_finish_plug(&plug);
51025102
done:
51035103
if (sc->nr_reclaimed > reclaimed)
5104-
pgdat->kswapd_failures = 0;
5104+
atomic_set(&pgdat->kswapd_failures, 0);
51055105
}
51065106

51075107
/******************************************************************************
@@ -6180,7 +6180,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
61806180
* successful direct reclaim run will revive a dormant kswapd.
61816181
*/
61826182
if (reclaimable)
6183-
pgdat->kswapd_failures = 0;
6183+
atomic_set(&pgdat->kswapd_failures, 0);
61846184
else if (sc->cache_trim_mode)
61856185
sc->cache_trim_mode_failed = 1;
61866186
}
@@ -6492,7 +6492,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
64926492
int i;
64936493
bool wmark_ok;
64946494

6495-
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
6495+
if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
64966496
return true;
64976497

64986498
for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
@@ -6902,7 +6902,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
69026902
wake_up_all(&pgdat->pfmemalloc_wait);
69036903

69046904
/* Hopeless node, leave it to direct reclaim */
6905-
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
6905+
if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
69066906
return true;
69076907

69086908
if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -7170,7 +7170,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
71707170
}
71717171

71727172
if (!sc.nr_reclaimed)
7173-
pgdat->kswapd_failures++;
7173+
atomic_inc(&pgdat->kswapd_failures);
71747174

71757175
out:
71767176
clear_reclaim_active(pgdat, highest_zoneidx);
@@ -7429,7 +7429,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
74297429
return;
74307430

74317431
/* Hopeless node, leave it to direct reclaim if possible */
7432-
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
7432+
if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
74337433
(pgdat_balanced(pgdat, order, highest_zoneidx) &&
74347434
!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
74357435
/*

mm/vmstat.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1848,7 +1848,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
18481848
seq_printf(m,
18491849
"\n node_unreclaimable: %u"
18501850
"\n start_pfn: %lu",
1851-
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1851+
atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
18521852
zone->zone_start_pfn);
18531853
seq_putc(m, '\n');
18541854
}

0 commit comments

Comments
 (0)