[RFC] Remove memory from nodes for memtrace.

Rashmica Gupta rashmica.g at gmail.com
Thu Feb 23 08:39:10 AEDT 2017


 Some powerpc hardware features may want to gain access to a
 chunk of undisturbed real memory.  This update provides a means to unplug
 said memory from the kernel with a set of sysfs calls.  By writing an integer
 containing  the size of memory to be unplugged into
 /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much
 memory from the end of each available chip's memory space. In addition, the
 means to read out the contents of the unplugged memory is also provided by
 reading out the /sys/kernel/debug/powerpc/memtrace/<chip-id>/dump file.

Signed-off-by: Rashmica Gupta <rashmica.g at gmail.com>
---
Written by Douglas Lehr <dllehr at us.ibm.com>.
Have tested and seems to work as I would expect. Only change I have made from
the original is to check that the value being written to the debugfs file is
not 0 (or obscenely large), as otherwise you get a nice kernel oops where the
kernel attempts to access data at 0xfffffffe0.  

Thoughts about doing this with hot unplug or other changes?

 arch/powerpc/mm/hash_native_64.c          |  39 +++-
 arch/powerpc/platforms/powernv/Makefile   |   1 +
 arch/powerpc/platforms/powernv/memtrace.c | 285 ++++++++++++++++++++++++++++++
 3 files changed, 321 insertions(+), 4 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/memtrace.c

diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index cc33260..44cc6ce 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -3,7 +3,7 @@
  *
  * SMP scalability work:
  *    Copyright (C) 2001 Anton Blanchard <anton at au.ibm.com>, IBM
- * 
+ *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
@@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
 	while (1) {
 		if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
 			break;
-		while(test_bit(HPTE_LOCK_BIT, word))
+		while (test_bit(HPTE_LOCK_BIT, word))
 			cpu_relax();
 	}
 }
@@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
 	}
 
 	for (i = 0; i < HPTES_PER_GROUP; i++) {
-		if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
+		if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
 			/* retry with lock held */
 			native_lock_hpte(hptep);
-			if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
+			if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID))
 				break;
 			native_unlock_hpte(hptep);
 		}
@@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 	tlbie(vpn, psize, psize, ssize, 0);
 }
 
+/*
+ * Remove a bolted kernel entry. Memory hotplug uses this.
+ *
+ * No need to lock here because we should be the only user.
+ */
+static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
+{
+	unsigned long vpn;
+	unsigned long vsid;
+	long slot;
+	struct hash_pte *hptep;
+
+	vsid = get_kernel_vsid(ea, ssize);
+	vpn = hpt_vpn(ea, vsid, ssize);
+
+	slot = native_hpte_find(vpn, psize, ssize);
+	if (slot == -1)
+		return -ENOENT;
+
+	hptep = htab_address + slot;
+
+	/* Invalidate the hpte */
+	hptep->v = 0;
+
+	/* Invalidate the TLB */
+	tlbie(vpn, psize, psize, ssize, 0);
+	return 0;
+}
+
+
 static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 				   int bpsize, int apsize, int ssize, int local)
 {
@@ -722,6 +752,7 @@ void __init hpte_init_native(void)
 	mmu_hash_ops.hpte_invalidate	= native_hpte_invalidate;
 	mmu_hash_ops.hpte_updatepp	= native_hpte_updatepp;
 	mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
+	mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
 	mmu_hash_ops.hpte_insert	= native_hpte_insert;
 	mmu_hash_ops.hpte_remove	= native_hpte_remove;
 	mmu_hash_ops.hpte_clear_all	= native_hpte_clear;
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index b5d98cb..2026661 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -11,4 +11,5 @@ obj-$(CONFIG_EEH)	+= eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
 obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
+obj-$(CONFIG_MEMORY_HOTREMOVE)         += memtrace.o
 obj-$(CONFIG_OPAL_PRD)	+= opal-prd.o
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
new file mode 100644
index 0000000..fdba3ee
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -0,0 +1,285 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Anton Blanchard <anton at au.ibm.com>
+ */
+
+#define pr_fmt(fmt) "powernv-memtrace: " fmt
+
+#include <linux/bitops.h>
+#include <linux/string.h>
+#include <linux/memblock.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <asm/machdep.h>
+
+struct memtrace_entry {
+	void *mem;
+	u64 start;
+	u64 size;
+	u32 nid;
+	struct dentry *dir;
+	char name[16];
+};
+
+static struct memtrace_entry *memtrace_array;
+static unsigned int memtrace_array_nr;
+
+static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
+			     size_t count, loff_t *ppos)
+{
+	struct memtrace_entry *ent = filp->private_data;
+
+	return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
+}
+
+static bool valid_memtrace_range(struct memtrace_entry *dev,
+				 unsigned long start, unsigned long size)
+{
+	if ((dev->start <= start) &&
+	    ((start + size) <= (dev->start + dev->size)))
+		return true;
+
+	return false;
+}
+
+static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct memtrace_entry *dev = filp->private_data;
+
+	if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size))
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	if (io_remap_pfn_range(vma, vma->vm_start,
+			       vma->vm_pgoff + (dev->start >> PAGE_SHIFT),
+			       size, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+static const struct file_operations memtrace_fops = {
+	.llseek = default_llseek,
+	.read	= memtrace_read,
+	.mmap	= memtrace_mmap,
+	.open	= simple_open,
+};
+
+static void flush_memory_region(u64 base, u64 size)
+{
+	unsigned long line_size = ppc64_caches.dline_size;
+	u64 end = base + size;
+	u64 addr;
+
+	base = round_down(base, line_size);
+	end = round_up(end, line_size);
+
+	for (addr = base; addr < end; addr += line_size)
+		asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory");
+}
+
+static int check_memblock_online(struct memory_block *mem, void *arg)
+{
+	if (mem->state != MEM_ONLINE)
+		return -1;
+
+	return 0;
+}
+
+static int change_memblock_state(struct memory_block *mem, void *arg)
+{
+	unsigned long state = (unsigned long)arg;
+
+	mem->state = state;
+	return 0;
+}
+
+static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
+{
+	u64 end_pfn = start_pfn + nr_pages - 1;
+
+	if (walk_memory_range(start_pfn, end_pfn, NULL,
+	    check_memblock_online))
+		return false;
+
+	walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
+			  change_memblock_state);
+
+	if (offline_pages(start_pfn, nr_pages)) {
+		walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
+				  change_memblock_state);
+		return false;
+	}
+
+	walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
+			  change_memblock_state);
+
+	/* RCU grace period? */
+	flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), nr_pages << PAGE_SHIFT);
+
+	remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
+
+	return true;
+}
+
+static u64 memtrace_alloc_node(u32 nid, u64 size)
+{
+	u64 start_pfn, end_pfn, nr_pages;
+	u64 base_pfn;
+
+	if (!NODE_DATA(nid) || !node_spanned_pages(nid))
+		return 0;
+
+	start_pfn = node_start_pfn(nid);
+	end_pfn = node_end_pfn(nid);
+	nr_pages = size >> PAGE_SHIFT;
+
+	/* Trace memory needs to be aligned to the size */
+	start_pfn = round_up(start_pfn, nr_pages);
+
+	for (base_pfn = start_pfn; base_pfn < end_pfn; base_pfn += nr_pages) {
+		if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true)
+			return base_pfn << PAGE_SHIFT;
+	}
+
+	return 0;
+}
+
+static int memtrace_init_regions_runtime(u64 size)
+{
+	u64 m;
+	u32 nid;
+
+	memtrace_array = kzalloc(sizeof(struct memtrace_entry) *
+				num_online_nodes(), GFP_KERNEL);
+	if (!memtrace_array) {
+		pr_err("Failed to allocate memtrace_array\n");
+		return -EINVAL;
+	}
+
+	for_each_online_node(nid) {
+		m = memtrace_alloc_node(nid, size);
+		/*
+		 * A node might not have any local memory, so warn but
+		 * continue on.
+		 */
+		if (!m) {
+			pr_err("Failed to allocate trace memory on node %d\n",
+				 nid);
+		} else {
+			pr_info("Allocated trace memory on node %d at "
+				"0x%016llx\n", nid, m);
+
+			memtrace_array[memtrace_array_nr].start = m;
+			memtrace_array[memtrace_array_nr].size = size;
+			memtrace_array[memtrace_array_nr].nid = nid;
+			memtrace_array_nr++;
+		}
+	}
+
+	return 0;
+}
+
+static struct dentry *memtrace_debugfs_dir;
+
+static int memtrace_init_debugfs(void)
+{
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < memtrace_array_nr; i++) {
+		struct dentry *dir;
+		struct memtrace_entry *ent = &memtrace_array[i];
+
+		ent->mem = ioremap(ent->start, ent->size);
+		/* Warn but continue on */
+		if (!ent->mem) {
+			pr_err("Failed to map trace memory at 0x%llx\n",
+				 ent->start);
+			ret = -1;
+			continue;
+		}
+
+		snprintf(ent->name, 16, "%08x", ent->nid);
+		dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);
+		if (!dir)
+			return -1;
+
+		ent->dir = dir;
+		debugfs_create_file("trace", S_IRUSR, dir, ent, &memtrace_fops);
+		debugfs_create_x64("start", S_IRUSR, dir, &ent->start);
+		debugfs_create_x64("size", S_IRUSR, dir, &ent->size);
+		debugfs_create_u32("node", S_IRUSR, dir, &ent->nid);
+	}
+
+	return ret;
+}
+
+static u64 memtrace_size;
+
+static int memtrace_enable_set(void *data, u64 val)
+{
+	if (memtrace_size)
+		return -EINVAL;
+
+	if (!val)
+		return -EINVAL;
+
+	/* Make sure size is aligned to a memory block */
+	if (val & (memory_block_size_bytes()-1))
+		return -EINVAL;
+
+	if (memtrace_init_regions_runtime(val))
+		return -EINVAL;
+
+	if (memtrace_init_debugfs())
+		return -EINVAL;
+
+	memtrace_size = val;
+
+	return 0;
+}
+
+static int memtrace_enable_get(void *data, u64 *val)
+{
+	*val = memtrace_size;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, memtrace_enable_set, "0x%016llx\n");
+
+static int memtrace_init(void)
+{
+	memtrace_debugfs_dir = debugfs_create_dir("memtrace", powerpc_debugfs_root);
+	if (!memtrace_debugfs_dir)
+		return -1;
+
+	debugfs_create_file("enable", S_IRUSR|S_IWUSR, memtrace_debugfs_dir,
+			    NULL, &memtrace_init_fops);
+
+	return 0;
+}
+machine_device_initcall(powernv, memtrace_init);
+
+
+
+/* XXX FIXME DEBUG CRAP */
+machine_device_initcall(pseries, memtrace_init);
-- 
2.9.3



More information about the Linuxppc-dev mailing list