279 lines
9.9 KiB
Diff
279 lines
9.9 KiB
Diff
From ed7c5f96ef6426a46c255706667dde67063110cb Mon Sep 17 00:00:00 2001
|
|
From: Harry Yoo <harry.yoo@oracle.com>
|
|
Date: Mon, 18 Aug 2025 11:02:05 +0900
|
|
Subject: mm: introduce and use {pgd,p4d}_populate_kernel()
|
|
|
|
Introduce and use {pgd,p4d}_populate_kernel() in core MM code when
|
|
populating PGD and P4D entries for the kernel address space. These
|
|
helpers ensure proper synchronization of page tables when updating the
|
|
kernel portion of top-level page tables.
|
|
|
|
Until now, the kernel has relied on each architecture to handle
|
|
synchronization of top-level page tables in an ad-hoc manner. For
|
|
example, see commit 9b861528a801 ("x86-64, mem: Update all PGDs for direct
|
|
mapping and vmemmap mapping changes").
|
|
|
|
However, this approach has proven fragile for following reasons:
|
|
|
|
1) It is easy to forget to perform the necessary page table
|
|
synchronization when introducing new changes.
|
|
For instance, commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory
|
|
savings for compound devmaps") overlooked the need to synchronize
|
|
page tables for the vmemmap area.
|
|
|
|
2) It is also easy to overlook that the vmemmap and direct mapping areas
|
|
must not be accessed before explicit page table synchronization.
|
|
For example, commit 8d400913c231 ("x86/vmemmap: handle unpopulated
|
|
sub-pmd ranges") caused crashes by accessing the vmemmap area
|
|
before calling sync_global_pgds().
|
|
|
|
To address this, as suggested by Dave Hansen, introduce _kernel() variants
|
|
of the page table population helpers, which invoke architecture-specific
|
|
hooks to properly synchronize page tables. These are introduced in a new
|
|
header file, include/linux/pgalloc.h, so they can be called from common
|
|
code.
|
|
|
|
They reuse existing infrastructure for vmalloc and ioremap.
|
|
Synchronization requirements are determined by ARCH_PAGE_TABLE_SYNC_MASK,
|
|
and the actual synchronization is performed by
|
|
arch_sync_kernel_mappings().
|
|
|
|
This change currently targets only x86_64, so only PGD and P4D level
|
|
helpers are introduced. Currently, these helpers are no-ops since no
|
|
architecture sets PGTBL_{PGD,P4D}_MODIFIED in ARCH_PAGE_TABLE_SYNC_MASK.
|
|
|
|
In theory, PUD and PMD level helpers can be added later if needed by other
|
|
architectures. For now, 32-bit architectures (x86-32 and arm) only handle
|
|
PGTBL_PMD_MODIFIED, so p*d_populate_kernel() will never affect them unless
|
|
we introduce a PMD level helper.
|
|
|
|
[harry.yoo@oracle.com: fix KASAN build error due to p*d_populate_kernel()]
|
|
Link: https://lkml.kernel.org/r/20250822020727.202749-1-harry.yoo@oracle.com
|
|
Link: https://lkml.kernel.org/r/20250818020206.4517-3-harry.yoo@oracle.com
|
|
Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges")
|
|
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
|
|
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
|
|
Acked-by: Kiryl Shutsemau <kas@kernel.org>
|
|
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
|
|
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
|
|
Acked-by: David Hildenbrand <david@redhat.com>
|
|
Cc: Alexander Potapenko <glider@google.com>
|
|
Cc: Alistair Popple <apopple@nvidia.com>
|
|
Cc: Andrey Konovalov <andreyknvl@gmail.com>
|
|
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
|
|
Cc: Andy Lutomirski <luto@kernel.org>
|
|
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
|
|
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
|
|
Cc: Ard Biesheuvel <ardb@kernel.org>
|
|
Cc: Arnd Bergmann <arnd@arndb.de>
|
|
Cc: bibo mao <maobibo@loongson.cn>
|
|
Cc: Borislav Petkov <bp@alien8.de>
|
|
Cc: Christoph Lameter (Ampere) <cl@gentwo.org>
|
|
Cc: Dennis Zhou <dennis@kernel.org>
|
|
Cc: Dev Jain <dev.jain@arm.com>
|
|
Cc: Dmitriy Vyukov <dvyukov@google.com>
|
|
Cc: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
|
|
Cc: Ingo Molnar <mingo@redhat.com>
|
|
Cc: Jane Chu <jane.chu@oracle.com>
|
|
Cc: Joao Martins <joao.m.martins@oracle.com>
|
|
Cc: Joerg Roedel <joro@8bytes.org>
|
|
Cc: John Hubbard <jhubbard@nvidia.com>
|
|
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
|
|
Cc: Liam Howlett <liam.howlett@oracle.com>
|
|
Cc: Michal Hocko <mhocko@suse.com>
|
|
Cc: Oscar Salvador <osalvador@suse.de>
|
|
Cc: Peter Xu <peterx@redhat.com>
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
|
Cc: Ryan Roberts <ryan.roberts@arm.com>
|
|
Cc: Suren Baghdasaryan <surenb@google.com>
|
|
Cc: Tejun Heo <tj@kernel.org>
|
|
Cc: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Thomas Huth <thuth@redhat.com>
|
|
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
|
|
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
|
|
Cc: Vlastimil Babka <vbabka@suse.cz>
|
|
Cc: <stable@vger.kernel.org>
|
|
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
|
---
|
|
include/linux/pgalloc.h | 29 +++++++++++++++++++++++++++++
|
|
include/linux/pgtable.h | 13 +++++++------
|
|
mm/kasan/init.c | 12 ++++++------
|
|
mm/percpu.c | 6 +++---
|
|
mm/sparse-vmemmap.c | 6 +++---
|
|
5 files changed, 48 insertions(+), 18 deletions(-)
|
|
create mode 100644 include/linux/pgalloc.h
|
|
|
|
--- /dev/null
|
|
+++ b/include/linux/pgalloc.h
|
|
@@ -0,0 +1,29 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _LINUX_PGALLOC_H
|
|
+#define _LINUX_PGALLOC_H
|
|
+
|
|
+#include <linux/pgtable.h>
|
|
+#include <asm/pgalloc.h>
|
|
+
|
|
+/*
|
|
+ * {pgd,p4d}_populate_kernel() are defined as macros to allow
|
|
+ * compile-time optimization based on the configured page table levels.
|
|
+ * Without this, linking may fail because callers (e.g., KASAN) may rely
|
|
+ * on calls to these functions being optimized away when passing symbols
|
|
+ * that exist only for certain page table levels.
|
|
+ */
|
|
+#define pgd_populate_kernel(addr, pgd, p4d) \
|
|
+ do { \
|
|
+ pgd_populate(&init_mm, pgd, p4d); \
|
|
+ if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED) \
|
|
+ arch_sync_kernel_mappings(addr, addr); \
|
|
+ } while (0)
|
|
+
|
|
+#define p4d_populate_kernel(addr, p4d, pud) \
|
|
+ do { \
|
|
+ p4d_populate(&init_mm, p4d, pud); \
|
|
+ if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_P4D_MODIFIED) \
|
|
+ arch_sync_kernel_mappings(addr, addr); \
|
|
+ } while (0)
|
|
+
|
|
+#endif /* _LINUX_PGALLOC_H */
|
|
--- a/include/linux/pgtable.h
|
|
+++ b/include/linux/pgtable.h
|
|
@@ -1332,8 +1332,8 @@ static inline void ptep_modify_prot_comm
|
|
|
|
/*
|
|
* Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
|
|
- * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
|
|
- * needs to be called.
|
|
+ * and let generic vmalloc, ioremap and page table update code know when
|
|
+ * arch_sync_kernel_mappings() needs to be called.
|
|
*/
|
|
#ifndef ARCH_PAGE_TABLE_SYNC_MASK
|
|
#define ARCH_PAGE_TABLE_SYNC_MASK 0
|
|
@@ -1832,10 +1832,11 @@ static inline bool arch_has_pfn_modify_c
|
|
/*
|
|
* Page Table Modification bits for pgtbl_mod_mask.
|
|
*
|
|
- * These are used by the p?d_alloc_track*() set of functions an in the generic
|
|
- * vmalloc/ioremap code to track at which page-table levels entries have been
|
|
- * modified. Based on that the code can better decide when vmalloc and ioremap
|
|
- * mapping changes need to be synchronized to other page-tables in the system.
|
|
+ * These are used by the p?d_alloc_track*() and p*d_populate_kernel()
|
|
+ * functions in the generic vmalloc, ioremap and page table update code
|
|
+ * to track at which page-table levels entries have been modified.
|
|
+ * Based on that the code can better decide when page table changes need
|
|
+ * to be synchronized to other page-tables in the system.
|
|
*/
|
|
#define __PGTBL_PGD_MODIFIED 0
|
|
#define __PGTBL_P4D_MODIFIED 1
|
|
--- a/mm/kasan/init.c
|
|
+++ b/mm/kasan/init.c
|
|
@@ -13,9 +13,9 @@
|
|
#include <linux/mm.h>
|
|
#include <linux/pfn.h>
|
|
#include <linux/slab.h>
|
|
+#include <linux/pgalloc.h>
|
|
|
|
#include <asm/page.h>
|
|
-#include <asm/pgalloc.h>
|
|
|
|
#include "kasan.h"
|
|
|
|
@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t
|
|
pud_t *pud;
|
|
pmd_t *pmd;
|
|
|
|
- p4d_populate(&init_mm, p4d,
|
|
+ p4d_populate_kernel(addr, p4d,
|
|
lm_alias(kasan_early_shadow_pud));
|
|
pud = pud_offset(p4d, addr);
|
|
pud_populate(&init_mm, pud,
|
|
@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t
|
|
} else {
|
|
p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
|
|
pud_init(p);
|
|
- p4d_populate(&init_mm, p4d, p);
|
|
+ p4d_populate_kernel(addr, p4d, p);
|
|
}
|
|
}
|
|
zero_pud_populate(p4d, addr, next);
|
|
@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(co
|
|
* puds,pmds, so pgd_populate(), pud_populate()
|
|
* is noops.
|
|
*/
|
|
- pgd_populate(&init_mm, pgd,
|
|
+ pgd_populate_kernel(addr, pgd,
|
|
lm_alias(kasan_early_shadow_p4d));
|
|
p4d = p4d_offset(pgd, addr);
|
|
- p4d_populate(&init_mm, p4d,
|
|
+ p4d_populate_kernel(addr, p4d,
|
|
lm_alias(kasan_early_shadow_pud));
|
|
pud = pud_offset(p4d, addr);
|
|
pud_populate(&init_mm, pud,
|
|
@@ -273,7 +273,7 @@ int __ref kasan_populate_early_shadow(co
|
|
if (!p)
|
|
return -ENOMEM;
|
|
} else {
|
|
- pgd_populate(&init_mm, pgd,
|
|
+ pgd_populate_kernel(addr, pgd,
|
|
early_alloc(PAGE_SIZE, NUMA_NO_NODE));
|
|
}
|
|
}
|
|
--- a/mm/percpu.c
|
|
+++ b/mm/percpu.c
|
|
@@ -3108,7 +3108,7 @@ out_free:
|
|
#endif /* BUILD_EMBED_FIRST_CHUNK */
|
|
|
|
#ifdef BUILD_PAGE_FIRST_CHUNK
|
|
-#include <asm/pgalloc.h>
|
|
+#include <linux/pgalloc.h>
|
|
|
|
#ifndef P4D_TABLE_SIZE
|
|
#define P4D_TABLE_SIZE PAGE_SIZE
|
|
@@ -3134,13 +3134,13 @@ void __init __weak pcpu_populate_pte(uns
|
|
|
|
if (pgd_none(*pgd)) {
|
|
p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
|
|
- pgd_populate(&init_mm, pgd, p4d);
|
|
+ pgd_populate_kernel(addr, pgd, p4d);
|
|
}
|
|
|
|
p4d = p4d_offset(pgd, addr);
|
|
if (p4d_none(*p4d)) {
|
|
pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
|
|
- p4d_populate(&init_mm, p4d, pud);
|
|
+ p4d_populate_kernel(addr, p4d, pud);
|
|
}
|
|
|
|
pud = pud_offset(p4d, addr);
|
|
--- a/mm/sparse-vmemmap.c
|
|
+++ b/mm/sparse-vmemmap.c
|
|
@@ -27,9 +27,9 @@
|
|
#include <linux/spinlock.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/sched.h>
|
|
+#include <linux/pgalloc.h>
|
|
|
|
#include <asm/dma.h>
|
|
-#include <asm/pgalloc.h>
|
|
#include <asm/tlbflush.h>
|
|
|
|
#include "hugetlb_vmemmap.h"
|
|
@@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(p
|
|
if (!p)
|
|
return NULL;
|
|
pud_init(p);
|
|
- p4d_populate(&init_mm, p4d, p);
|
|
+ p4d_populate_kernel(addr, p4d, p);
|
|
}
|
|
return p4d;
|
|
}
|
|
@@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(u
|
|
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
|
|
if (!p)
|
|
return NULL;
|
|
- pgd_populate(&init_mm, pgd, p);
|
|
+ pgd_populate_kernel(addr, pgd, p);
|
|
}
|
|
return pgd;
|
|
}
|