[PATCH 37/49] mm/sparse-vmemmap: unify DAX and HugeTLB vmemmap optimization
Muchun Song
songmuchun at bytedance.com
Sun Apr 5 22:52:28 AEST 2026
The ultimate goal of the recent refactoring series is to unify the vmemmap
optimization logic for both DAX and HugeTLB under a common framework
(CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION).
A key breakthrough in this unification is that DAX now only requires 1
vmemmap page to be preserved (the head page), aligning its requirements
exactly with HugeTLB. Previously, DAX optimization relied on a dedicated
upper-level function, vmemmap_populate_compound_pages, which manually
allocated both the head page and the first tail page before reusing the
shared tail page for the remaining tail pages.
Because DAX and HugeTLB are now perfectly aligned in their optimization
requirements (1 reserved page + reused shared tail pages), this patch
eliminates the dedicated compound page mapping loop entirely and instead
pushes the optimization decision down to the lowest level,
vmemmap_pte_populate. As a result, all mapping requests now flow through
the standard vmemmap_populate_basepages path.
Signed-off-by: Muchun Song <songmuchun at bytedance.com>
---
arch/powerpc/mm/book3s64/radix_pgtable.c | 13 +-
include/linux/mm.h | 2 +-
mm/mm_init.c | 2 +-
mm/sparse-vmemmap.c | 185 +++++------------------
4 files changed, 40 insertions(+), 162 deletions(-)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 5ce3deb464d5..714d5cdc10ec 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1326,17 +1326,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
return -ENOMEM;
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
- /*
- * Populate the tail pages vmemmap page
- * It can fall in different pmd, hence
- * vmemmap_populate_address()
- */
- pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
- if (!pte)
- return -ENOMEM;
-
- addr_pfn += 2;
- next = addr + 2 * PAGE_SIZE;
+ addr_pfn += 1;
+ next = addr + PAGE_SIZE;
continue;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 15841829b7eb..bceef0dc578b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4912,7 +4912,7 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
}
#endif
-#define VMEMMAP_RESERVE_NR 2
+#define VMEMMAP_RESERVE_NR OPTIMIZED_FOLIO_VMEMMAP_PAGES
#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 636a0f9644f6..6b23b5f02544 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1066,7 +1066,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* initialize is a lot smaller that the total amount of struct pages being
* mapped. This is a paired / mild layering violation with explicit knowledge
* of how the sparse_vmemmap internals handle compound pages in the lack
- * of an altmap. See vmemmap_populate_compound_pages().
+ * of an altmap.
*/
static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
struct dev_pagemap *pgmap,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 1867b5dcc73c..fd7b0e1e5aba 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -152,46 +152,40 @@ static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, in
struct vmem_altmap *altmap,
unsigned long ptpfn)
{
- pte_t *pte = pte_offset_kernel(pmd, addr);
-
- if (pte_none(ptep_get(pte))) {
- pte_t entry;
-
- if (vmemmap_page_optimizable((struct page *)addr) &&
- ptpfn == (unsigned long)-1) {
- struct page *page;
- unsigned long pfn = page_to_pfn((struct page *)addr);
- const struct mem_section *ms = __pfn_to_section(pfn);
-
- page = vmemmap_shared_tail_page(section_order(ms),
- section_to_zone(ms, node));
- if (!page)
- return NULL;
- ptpfn = page_to_pfn(page);
- }
+ pte_t entry, *pte = pte_offset_kernel(pmd, addr);
- if (ptpfn == (unsigned long)-1) {
- void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
-
- if (!p)
- return NULL;
- ptpfn = PHYS_PFN(__pa(p));
- } else {
- /*
- * When a PTE/PMD entry is freed from the init_mm
- * there's a free_pages() call to this page allocated
- * above. Thus this get_page() is paired with the
- * put_page_testzero() on the freeing path.
- * This can only called by certain ZONE_DEVICE path,
- * and through vmemmap_populate_compound_pages() when
- * slab is available.
- */
- if (slab_is_available())
- get_page(pfn_to_page(ptpfn));
- }
- entry = pfn_pte(ptpfn, PAGE_KERNEL);
- set_pte_at(&init_mm, addr, pte, entry);
+ if (!pte_none(ptep_get(pte)))
+ return pte;
+
+ /* See layout diagram in Documentation/mm/vmemmap_dedup.rst. */
+ if (vmemmap_page_optimizable((struct page *)addr)) {
+ struct page *page;
+ unsigned long pfn = page_to_pfn((struct page *)addr);
+ const struct mem_section *ms = __pfn_to_section(pfn);
+
+ page = vmemmap_shared_tail_page(section_order(ms),
+ section_to_zone(ms, node));
+ if (!page)
+ return NULL;
+
+ /*
+ * When a PTE entry is freed, a free_pages() call occurs. This
+ * get_page() pairs with put_page_testzero() on the freeing
+ * path. This can only occur when slab is available.
+ */
+ if (slab_is_available())
+ get_page(page);
+ ptpfn = page_to_pfn(page);
+ } else {
+ void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+
+ if (!p)
+ return NULL;
+ ptpfn = PHYS_PFN(__pa(p));
}
+ entry = pfn_pte(ptpfn, PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+
return pte;
}
@@ -287,17 +281,15 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
return pte;
}
-static int __meminit vmemmap_populate_range(unsigned long start,
- unsigned long end, int node,
- struct vmem_altmap *altmap,
- unsigned long ptpfn)
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long addr = start;
pte_t *pte;
for (; addr < end; addr += PAGE_SIZE) {
- pte = vmemmap_populate_address(addr, node, altmap,
- ptpfn);
+ pte = vmemmap_populate_address(addr, node, altmap, -1);
if (!pte)
return -ENOMEM;
}
@@ -305,19 +297,6 @@ static int __meminit vmemmap_populate_range(unsigned long start,
return 0;
}
-static int __meminit vmemmap_populate_compound_pages(unsigned long start,
- unsigned long end, int node,
- struct dev_pagemap *pgmap);
-
-int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
- int node, struct vmem_altmap *altmap,
- struct dev_pagemap *pgmap)
-{
- if (vmemmap_can_optimize(altmap, pgmap))
- return vmemmap_populate_compound_pages(start, end, node, pgmap);
- return vmemmap_populate_range(start, end, node, altmap, -1);
-}
-
/*
* Write protect the mirrored tail page structs for HVO. This will be
* called from the hugetlb code when gathering and initializing the
@@ -397,9 +376,6 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
pud_t *pud;
pmd_t *pmd;
- if (vmemmap_can_optimize(altmap, pgmap))
- return vmemmap_populate_compound_pages(start, end, node, pgmap);
-
for (addr = start; addr < end; addr = next) {
unsigned long pfn = page_to_pfn((struct page *)addr);
const struct mem_section *ms = __pfn_to_section(pfn);
@@ -447,95 +423,6 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
return 0;
}
-/*
- * For compound pages bigger than section size (e.g. x86 1G compound
- * pages with 2M subsection size) fill the rest of sections as tail
- * pages.
- *
- * Note that memremap_pages() resets @nr_range value and will increment
- * it after each range successful onlining. Thus the value or @nr_range
- * at section memmap populate corresponds to the in-progress range
- * being onlined here.
- */
-static bool __meminit reuse_compound_section(unsigned long start_pfn,
- struct dev_pagemap *pgmap)
-{
- unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
- unsigned long offset = start_pfn -
- PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
-
- return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
-}
-
-static int __meminit vmemmap_populate_compound_pages(unsigned long start,
- unsigned long end, int node,
- struct dev_pagemap *pgmap)
-{
- unsigned long size, addr;
- pte_t *pte;
- int rc;
- unsigned long start_pfn = page_to_pfn((struct page *)start);
- const struct mem_section *ms = __pfn_to_section(start_pfn);
- struct page *tail;
-
- /* This may occur in sub-section scenarios. */
- if (!section_vmemmap_optimizable(ms))
- return vmemmap_populate_range(start, end, node, NULL, -1);
-
- tail = vmemmap_shared_tail_page(section_order(ms),
- section_to_zone(ms, node));
- if (!tail)
- return -ENOMEM;
-
- if (reuse_compound_section(start_pfn, pgmap))
- return vmemmap_populate_range(start, end, node, NULL,
- page_to_pfn(tail));
-
- size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
- for (addr = start; addr < end; addr += size) {
- unsigned long next, last = addr + size;
- void *p;
-
- /* Populate the head page vmemmap page */
- pte = vmemmap_populate_address(addr, node, NULL, -1);
- if (!pte)
- return -ENOMEM;
-
- /*
- * Allocate manually since vmemmap_populate_address() will assume DAX
- * only needs 1 vmemmap page to be reserved, however DAX now needs 2
- * vmemmap pages. This is a temporary solution and will be unified
- * with HugeTLB in the future.
- */
- p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
- if (!p)
- return -ENOMEM;
-
- /* Populate the tail pages vmemmap page */
- next = addr + PAGE_SIZE;
- pte = vmemmap_populate_address(next, node, NULL, PHYS_PFN(__pa(p)));
- /*
- * get_page() is called above. Since we are not actually
- * reusing it, to avoid a memory leak, we call put_page() here.
- */
- put_page(virt_to_page(p));
- if (!pte)
- return -ENOMEM;
-
- /*
- * Reuse the shared vmemmap page for the rest of tail pages
- * See layout diagram in Documentation/mm/vmemmap_dedup.rst
- */
- next += PAGE_SIZE;
- rc = vmemmap_populate_range(next, last, node, NULL,
- page_to_pfn(tail));
- if (rc)
- return -ENOMEM;
- }
-
- return 0;
-}
-
struct page * __meminit __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
--
2.20.1
More information about the Linuxppc-dev
mailing list