[PATCH RFC v1 3/3] powerpc/memtrace: Don't offline memory blocks via offline_pages()

David Hildenbrand david at redhat.com
Tue Dec 17 23:38:51 AEDT 2019


offline_pages() should not be called outside of the MM core. Especially,
having to manually fiddle with the memory block states is a sign that
this is not a good idea. To offline memory block devices cleanly,
device_offline() should be used. This is the only remaining caller of
offline_pages(), except the official device_offline() way.

E.g., when trying to allocate right now we trigger messages like
[   11.227817] page:c00c000000182000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0
[   11.228056] raw: 007ffff000000000 c000000001538860 c000000001538860 0000000000000000
[   11.228070] raw: 0000000000000000 0000000000000001 00000001ffffffff 0000000000000000
[   11.228097] page dumped because: unmovable page

and theoretically we might end up looping quite a long time trying to
offline memory, which would have to be canceled by the user manually
(CTRL-C).

Memtrace needs to identify+allocate multiple consecutive memory blocks.
It also has to remove the memory blocks to remove all page tables
(HW requirement).

Let's use alloc_contig_pages() to allocate memory that spans multiple
memory block devices. We can then set all pages PageOffline() to allow
these pages to get isolated. A temporary memory notifier can then make
offlining of these pages succeed by dropping its reference to the pages
on MEM_GOING_OFFLINE events(as documented in include/linux/page-flags.h
for PageOffline() pages). Error handling is a bit tricky.

Note1: ZONE_MOVABLE memory blocks won't be considered. Not sure if that
was ever really relevant. (unmovable data would end up on these memory
blocks for a tiny little time frame)

Note2: We don't have to care about online_page_callback_t, as we forbid
re-onlining from our memory notifier.

Note3: I was told this feature is never used along with DIMM-based memory
hotunplug - otherwise bad things will happen when a DIMM would try to
remove "alread-removed" memory (that is still in use).

Tested under QEMU with powernv emulation (kernel + initramfs).

$ mount -t debugfs none /sys/kernel/debug/
$ cat /sys/devices/system/memory/block_size_bytes
10000000
$ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable
[   19.809790] Offlined Pages 4096
[   19.835842] Offlined Pages 4096
[   19.853136] memtrace: Allocated trace memory on node 0 at 0x0000000040000000

Unfortunately, QEMU does not support NUMA for powernv yet, so I cannot
test that.

Cc: Benjamin Herrenschmidt <benh at kernel.crashing.org>
Cc: Paul Mackerras <paulus at samba.org>
Cc: Michael Ellerman <mpe at ellerman.id.au>
Cc: Andrew Morton <akpm at linux-foundation.org>
Cc: David Hildenbrand <david at redhat.com>
Cc: Allison Randal <allison at lohutok.net>
Cc: Jens Axboe <axboe at kernel.dk>
Cc: Anshuman Khandual <anshuman.khandual at arm.com>
Cc: Thomas Gleixner <tglx at linutronix.de>
Cc: Michal Hocko <mhocko at kernel.org>
Cc: Oscar Salvador <osalvador at suse.de>
Cc: Balbir Singh <bsingharora at gmail.com>
Cc: Rashmica Gupta <rashmica.g at gmail.com>
Cc: linuxppc-dev at lists.ozlabs.org
Signed-off-by: David Hildenbrand <david at redhat.com>
---
 arch/powerpc/platforms/powernv/Kconfig    |   1 +
 arch/powerpc/platforms/powernv/memtrace.c | 175 ++++++++++++++--------
 2 files changed, 112 insertions(+), 64 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 938803eab0ad..571a0fa9f055 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -29,6 +29,7 @@ config OPAL_PRD
 config PPC_MEMTRACE
 	bool "Enable removal of RAM from kernel mappings for tracing"
 	depends on PPC_POWERNV && MEMORY_HOTREMOVE
+	select CONTIG_ALLOC
 	help
 	  Enabling this option allows for the removal of memory (RAM)
 	  from the kernel mappings to be used for hardware tracing.
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 2d2a0a2acd60..fe1e8f3926a1 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -76,83 +76,130 @@ static int memtrace_free_node(int nid, unsigned long start, unsigned long size)
 	return ret;
 }
 
-static int check_memblock_online(struct memory_block *mem, void *arg)
-{
-	if (mem->state != MEM_ONLINE)
-		return -1;
-
-	return 0;
-}
-
-static int change_memblock_state(struct memory_block *mem, void *arg)
-{
-	unsigned long state = (unsigned long)arg;
-
-	mem->state = state;
-
-	return 0;
-}
+struct memtrace_alloc_info {
+	struct notifier_block memory_notifier;
+	unsigned long base_pfn;
+	unsigned long nr_pages;
+};
 
-/* called with device_hotplug_lock held */
-static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
+static int memtrace_memory_notifier_cb(struct notifier_block *nb,
+				       unsigned long action, void *arg)
 {
-	const unsigned long start = PFN_PHYS(start_pfn);
-	const unsigned long size = PFN_PHYS(nr_pages);
-
-	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
-		return false;
-
-	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
-			   change_memblock_state);
-
-	if (offline_pages(start_pfn, nr_pages)) {
-		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
-				   change_memblock_state);
-		return false;
+	struct memtrace_alloc_info *info = container_of(nb,
+						     struct memtrace_alloc_info,
+						     memory_notifier);
+	unsigned long pfn, start_pfn, end_pfn;
+	const struct memory_notify *mhp = arg;
+	static bool going_offline;
+
+	/* Ignore ranges that don't overlap. */
+	if (mhp->start_pfn + mhp->nr_pages <= info->base_pfn ||
+	    info->base_pfn + info->nr_pages <= mhp->start_pfn)
+		return NOTIFY_OK;
+
+	start_pfn = max_t(unsigned long, info->base_pfn, mhp->start_pfn);
+	end_pfn = min_t(unsigned long, info->base_pfn + info->nr_pages,
+			mhp->start_pfn + mhp->nr_pages);
+
+	/*
+	 * Drop our reference to the allocated (PageOffline()) pages, but
+	 * reaquire them in case offlining fails. We might get called for
+	 * MEM_CANCEL_OFFLINE but not for MEM_GOING_OFFLINE in case another
+	 * notifier aborted offlining.
+	 */
+	switch (action) {
+	case MEM_GOING_OFFLINE:
+		for (pfn = start_pfn; pfn < end_pfn; pfn++)
+			page_ref_dec(pfn_to_page(pfn));
+		going_offline = true;
+		break;
+	case MEM_CANCEL_OFFLINE:
+		if (going_offline)
+			for (pfn = start_pfn; pfn < end_pfn; pfn++)
+				page_ref_inc(pfn_to_page(pfn));
+		going_offline = false;
+		break;
+	case MEM_GOING_ONLINE:
+		/*
+		 * While our notifier is active, user space could
+		 * offline+re-online this memory. Disallow any such activity.
+		 */
+		return notifier_to_errno(-EBUSY);
 	}
-
-	walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
-			   change_memblock_state);
-
-
-	return true;
+	return NOTIFY_OK;
 }
 
 static u64 memtrace_alloc_node(u32 nid, u64 size)
 {
-	u64 start_pfn, end_pfn, nr_pages, pfn;
-	u64 base_pfn;
-	u64 bytes = memory_block_size_bytes();
+	const unsigned long memory_block_bytes = memory_block_size_bytes();
+	const unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct memtrace_alloc_info info = {
+		.memory_notifier = {
+			.notifier_call = memtrace_memory_notifier_cb,
+		},
+	};
+	unsigned long base_pfn, to_remove_pfn, pfn;
+	struct page *page;
+	int ret;
 
 	if (!node_spanned_pages(nid))
 		return 0;
 
-	start_pfn = node_start_pfn(nid);
-	end_pfn = node_end_pfn(nid);
-	nr_pages = size >> PAGE_SHIFT;
-
-	/* Trace memory needs to be aligned to the size */
-	end_pfn = round_down(end_pfn - nr_pages, nr_pages);
-
-	lock_device_hotplug();
-	for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
-		if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
-			/*
-			 * Remove memory in memory block size chunks so that
-			 * iomem resources are always split to the same size and
-			 * we never try to remove memory that spans two iomem
-			 * resources.
-			 */
-			end_pfn = base_pfn + nr_pages;
-			for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) {
-				__remove_memory(nid, pfn << PAGE_SHIFT, bytes);
-			}
-			unlock_device_hotplug();
-			return base_pfn << PAGE_SHIFT;
-		}
+	/*
+	 * Try to allocate memory (that might span multiple memory blocks)
+	 * on the requested node. Trace memory needs to be aligned to the size,
+	 * which is guaranteed by alloc_contig_pages().
+	 */
+	page = alloc_contig_pages(nr_pages, __GFP_THISNODE, nid, NULL);
+	if (!page)
+		return 0;
+	to_remove_pfn = base_pfn = page_to_pfn(page);
+	info.base_pfn = base_pfn;
+	info.nr_pages = nr_pages;
+
+	/* PageOffline() allows to isolate the memory when offlining. */
+	for (pfn = base_pfn; pfn < base_pfn + nr_pages; pfn++)
+		__SetPageOffline(pfn_to_page(pfn));
+
+	/* A temporary memory notifier allows to offline the isolated memory. */
+	ret = register_memory_notifier(&info.memory_notifier);
+	if (ret)
+		goto out_free_pages;
+
+	/*
+	 * Try to offline and remove all involved memory blocks. This will
+	 * only fail in the unlikely event that another memory notifier NACKs
+	 * the offlining request - no memory has to be migrated.
+	 *
+	 * Remove memory in memory block size chunks so that iomem resources
+	 * are always split to the same size and we never try to remove memory
+	 * that spans two iomem resources.
+	 */
+	for (; to_remove_pfn < base_pfn + nr_pages;
+	     to_remove_pfn += PHYS_PFN(memory_block_bytes)) {
+		ret = offline_and_remove_memory(nid, PFN_PHYS(to_remove_pfn),
+						memory_block_bytes);
+		if (ret)
+			goto out_readd_memory;
 	}
-	unlock_device_hotplug();
 
+	unregister_memory_notifier(&info.memory_notifier);
+	return PFN_PHYS(base_pfn);
+out_readd_memory:
+	/* Unregister before adding+onlining (notifer blocks onlining). */
+	unregister_memory_notifier(&info.memory_notifier);
+	if (to_remove_pfn != base_pfn) {
+		ret = memtrace_free_node(nid, PFN_PHYS(base_pfn),
+					 PFN_PHYS(to_remove_pfn - base_pfn));
+		if (ret)
+			/* Even more unlikely, log and ignore. */
+			pr_err("Failed to add trace memory to node %d\n", nid);
+	}
+out_free_pages:
+	/* Only free memory that was not temporarily offlined+removed. */
+	for (pfn = to_remove_pfn; pfn < base_pfn + nr_pages; pfn++)
+		__ClearPageOffline(pfn_to_page(pfn));
+	free_contig_range(to_remove_pfn, nr_pages - (to_remove_pfn - base_pfn));
 	return 0;
 }
 
-- 
2.23.0



More information about the Linuxppc-dev mailing list