[RFC] Remove memory from nodes for memtrace.
Oliver O'Halloran
oohall at gmail.com
Mon Feb 27 10:54:41 AEDT 2017
On Thu, Feb 23, 2017 at 8:39 AM, Rashmica Gupta <rashmica.g at gmail.com> wrote:
> Some powerpc hardware features may want to gain access to a
> chunk of undisturbed real memory. This update provides a means to unplug
> said memory from the kernel with a set of debugfs calls. By writing an
> integer containing the size of memory to be unplugged into
> /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much
> memory from the end of each available chip's memory space. The contents
> of the unplugged memory can then be read back through the
> /sys/kernel/debug/powerpc/memtrace/<chip-id>/trace file.
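For anyone following along at home, usage ends up looking something
like this (the size here is just an example, and needs to be a
multiple of the memory block size, see below):

	echo 268435456 > /sys/kernel/debug/powerpc/memtrace/enable
	cat /sys/kernel/debug/powerpc/memtrace/00000000/trace > node0.bin

The per-node directory name is the node id printed with %08x.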
>
> Signed-off-by: Rashmica Gupta <rashmica.g at gmail.com>
> ---
> Written by Douglas Lehr <dllehr at us.ibm.com>.
> I have tested this and it seems to work as I would expect. The only change
> I have made from the original is to check that the value being written to
> the debugfs file is not 0 (or obscenely large), as otherwise you get a nice
> kernel oops where the kernel attempts to access data at 0xfffffffe0.
>
> Thoughts about doing this with hot unplug or other changes?
>
> arch/powerpc/mm/hash_native_64.c | 39 +++-
> arch/powerpc/platforms/powernv/Makefile | 1 +
> arch/powerpc/platforms/powernv/memtrace.c | 285 ++++++++++++++++++++++++++++++
> 3 files changed, 321 insertions(+), 4 deletions(-)
> create mode 100644 arch/powerpc/platforms/powernv/memtrace.c
>
> diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
> index cc33260..44cc6ce 100644
> --- a/arch/powerpc/mm/hash_native_64.c
> +++ b/arch/powerpc/mm/hash_native_64.c
> @@ -3,7 +3,7 @@
> *
> * SMP scalability work:
> * Copyright (C) 2001 Anton Blanchard <anton at au.ibm.com>, IBM
> - *
> + *
> * This program is free software; you can redistribute it and/or
> * modify it under the terms of the GNU General Public License
> * as published by the Free Software Foundation; either version
> @@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep)
> while (1) {
> if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
> break;
> - while(test_bit(HPTE_LOCK_BIT, word))
> + while (test_bit(HPTE_LOCK_BIT, word))
> cpu_relax();
> }
> }
> @@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
> }
>
> for (i = 0; i < HPTES_PER_GROUP; i++) {
> - if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
> + if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
> /* retry with lock held */
> native_lock_hpte(hptep);
> - if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
> + if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID))
> break;
> native_unlock_hpte(hptep);
> }
> @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
> tlbie(vpn, psize, psize, ssize, 0);
> }
>
> +/*
> + * Remove a bolted kernel entry. Memory hotplug uses this.
> + *
> + * No need to lock here because we should be the only user.
> + */
> +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
> +{
> + unsigned long vpn;
> + unsigned long vsid;
> + long slot;
> + struct hash_pte *hptep;
> +
> + vsid = get_kernel_vsid(ea, ssize);
> + vpn = hpt_vpn(ea, vsid, ssize);
> +
> + slot = native_hpte_find(vpn, psize, ssize);
> + if (slot == -1)
> + return -ENOENT;
> +
> + hptep = htab_address + slot;
> +
> + /* Invalidate the hpte */
> + hptep->v = 0;
> +
> + /* Invalidate the TLB */
> + tlbie(vpn, psize, psize, ssize, 0);
> + return 0;
> +}
> +
> +
> static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
> int bpsize, int apsize, int ssize, int local)
> {
> @@ -722,6 +752,7 @@ void __init hpte_init_native(void)
> mmu_hash_ops.hpte_invalidate = native_hpte_invalidate;
> mmu_hash_ops.hpte_updatepp = native_hpte_updatepp;
> mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
> + mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
> mmu_hash_ops.hpte_insert = native_hpte_insert;
> mmu_hash_ops.hpte_remove = native_hpte_remove;
> mmu_hash_ops.hpte_clear_all = native_hpte_clear;
> diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
> index b5d98cb..2026661 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -11,4 +11,5 @@ obj-$(CONFIG_EEH) += eeh-powernv.o
> obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
> obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
> obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o
> +obj-$(CONFIG_MEMORY_HOTREMOVE) += memtrace.o
> obj-$(CONFIG_OPAL_PRD) += opal-prd.o
> diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
> new file mode 100644
> index 0000000..fdba3ee
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/memtrace.c
> @@ -0,0 +1,285 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * Copyright (C) IBM Corporation, 2014
> + *
> + * Author: Anton Blanchard <anton at au.ibm.com>
> + */
> +
> +#define pr_fmt(fmt) "powernv-memtrace: " fmt
> +
> +#include <linux/bitops.h>
> +#include <linux/string.h>
> +#include <linux/memblock.h>
> +#include <linux/init.h>
> +#include <linux/moduleparam.h>
> +#include <linux/fs.h>
> +#include <linux/debugfs.h>
> +#include <linux/slab.h>
> +#include <linux/memory.h>
> +#include <linux/memory_hotplug.h>
> +#include <asm/machdep.h>
> +
> +struct memtrace_entry {
> + void *mem;
> + u64 start;
> + u64 size;
> + u32 nid;
> + struct dentry *dir;
> + char name[16];
> +};
> +
> +static struct memtrace_entry *memtrace_array;
> +static unsigned int memtrace_array_nr;
> +
> +static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
> + size_t count, loff_t *ppos)
> +{
> + struct memtrace_entry *ent = filp->private_data;
> +
> + return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
> +}
> +
> +static bool valid_memtrace_range(struct memtrace_entry *dev,
> + unsigned long start, unsigned long size)
> +{
> + if ((dev->start <= start) &&
> + ((start + size) <= (dev->start + dev->size)))
> + return true;
> +
> + return false;
> +}
> +
> +static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> + unsigned long size = vma->vm_end - vma->vm_start;
> + struct memtrace_entry *dev = filp->private_data;
> +
> + if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size))
> + return -EINVAL;
> +
> + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +
> + if (io_remap_pfn_range(vma, vma->vm_start,
> + vma->vm_pgoff + (dev->start >> PAGE_SHIFT),
> + size, vma->vm_page_prot))
> + return -EAGAIN;
> +
> + return 0;
> +}
> +
> +static const struct file_operations memtrace_fops = {
> + .llseek = default_llseek,
> + .read = memtrace_read,
> + .mmap = memtrace_mmap,
> + .open = simple_open,
> +};
> +
> +static void flush_memory_region(u64 base, u64 size)
> +{
> + unsigned long line_size = ppc64_caches.dline_size;
> + u64 end = base + size;
> + u64 addr;
> +
> + base = round_down(base, line_size);
> + end = round_up(end, line_size);
> +
> + for (addr = base; addr < end; addr += line_size)
> +		asm volatile("dcbf 0,%0" : : "r" (addr) : "memory");
> +}
I think this is the same as flush_inval_dcache_range()?
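If so, the whole helper reduces to something like this (untested,
assuming the usual start/stop calling convention from
asm/cacheflush.h):

	static void flush_memory_region(u64 base, u64 size)
	{
		/* flushes and invalidates the d-cache lines covering
		 * the [start, stop) virtual address range */
		flush_inval_dcache_range((unsigned long)base,
					 (unsigned long)(base + size));
	}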
> +
> +static int check_memblock_online(struct memory_block *mem, void *arg)
> +{
> + if (mem->state != MEM_ONLINE)
> + return -1;
> +
> + return 0;
> +}
> +
> +static int change_memblock_state(struct memory_block *mem, void *arg)
> +{
> + unsigned long state = (unsigned long)arg;
> +
> + mem->state = state;
> + return 0;
> +}
> +
> +static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
> +{
> + u64 end_pfn = start_pfn + nr_pages - 1;
> +
> + if (walk_memory_range(start_pfn, end_pfn, NULL,
> + check_memblock_online))
> + return false;
> +
> + walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
> + change_memblock_state);
> +
> + if (offline_pages(start_pfn, nr_pages)) {
> + walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
> + change_memblock_state);
> + return false;
> + }
> +
> + walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
> + change_memblock_state);
> +
> + /* RCU grace period? */
> + flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT), nr_pages << PAGE_SHIFT);
> +
> + remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
This concerns me slightly. We're relying on flush_memory_region() to
ensure the whole region is out of the cache, but until the region has
been unmapped the processor can still do speculative loads from it.
Given that the memory will later be re-mapped cache-inhibited, this
opens a window for cache paradoxes. It might not be an issue, but if
you encounter random checkstops while testing we should look into
hardening this.
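Roughly, the window I'm worried about, annotating the sequence above:

	flush_memory_region(va, size);	/* lines pushed out of the d-cache */
					/* <- a speculative load through the
					 * still-bolted linear mapping can
					 * pull lines straight back in here */
	remove_memory(nid, start, size);/* linear mapping finally torn down */
	...
	ioremap(start, size);		/* same range re-mapped non-cacheable
					 * later in memtrace_init_debugfs() */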
> +
> + return true;
> +}
> +
> +static u64 memtrace_alloc_node(u32 nid, u64 size)
> +{
> + u64 start_pfn, end_pfn, nr_pages;
> + u64 base_pfn;
> +
> + if (!NODE_DATA(nid) || !node_spanned_pages(nid))
> + return 0;
> +
> + start_pfn = node_start_pfn(nid);
> + end_pfn = node_end_pfn(nid);
> + nr_pages = size >> PAGE_SHIFT;
> +
> + /* Trace memory needs to be aligned to the size */
> + start_pfn = round_up(start_pfn, nr_pages);
> +
> + for (base_pfn = start_pfn; base_pfn < end_pfn; base_pfn += nr_pages) {
> +		if (memtrace_offline_pages(nid, base_pfn, nr_pages))
> + return base_pfn << PAGE_SHIFT;
> + }
> +
> + return 0;
> +}
> +
> +static int memtrace_init_regions_runtime(u64 size)
> +{
> + u64 m;
> + u32 nid;
> +
> + memtrace_array = kzalloc(sizeof(struct memtrace_entry) *
> + num_online_nodes(), GFP_KERNEL);
> + if (!memtrace_array) {
> + pr_err("Failed to allocate memtrace_array\n");
> + return -EINVAL;
> + }
> +
> + for_each_online_node(nid) {
> + m = memtrace_alloc_node(nid, size);
> + /*
> + * A node might not have any local memory, so warn but
> + * continue on.
> + */
> + if (!m) {
> + pr_err("Failed to allocate trace memory on node %d\n",
> + nid);
> + } else {
> + pr_info("Allocated trace memory on node %d at "
> + "0x%016llx\n", nid, m);
> +
> + memtrace_array[memtrace_array_nr].start = m;
> + memtrace_array[memtrace_array_nr].size = size;
> + memtrace_array[memtrace_array_nr].nid = nid;
> + memtrace_array_nr++;
> + }
> + }
> +
> + return 0;
> +}
> +
> +static struct dentry *memtrace_debugfs_dir;
> +
> +static int memtrace_init_debugfs(void)
> +{
> + int ret = 0;
> + int i;
> +
> + for (i = 0; i < memtrace_array_nr; i++) {
> + struct dentry *dir;
> + struct memtrace_entry *ent = &memtrace_array[i];
> +
> + ent->mem = ioremap(ent->start, ent->size);
> + /* Warn but continue on */
> + if (!ent->mem) {
> + pr_err("Failed to map trace memory at 0x%llx\n",
> + ent->start);
> + ret = -1;
> + continue;
> + }
> +
> + snprintf(ent->name, 16, "%08x", ent->nid);
> + dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);
> + if (!dir)
> + return -1;
> +
> + ent->dir = dir;
> + debugfs_create_file("trace", S_IRUSR, dir, ent, &memtrace_fops);
> + debugfs_create_x64("start", S_IRUSR, dir, &ent->start);
> + debugfs_create_x64("size", S_IRUSR, dir, &ent->size);
> + debugfs_create_u32("node", S_IRUSR, dir, &ent->nid);
> + }
> +
> + return ret;
> +}
> +
> +static u64 memtrace_size;
> +
> +static int memtrace_enable_set(void *data, u64 val)
> +{
> + if (memtrace_size)
> + return -EINVAL;
> +
> + if (!val)
> + return -EINVAL;
> +
> + /* Make sure size is aligned to a memory block */
> + if (val & (memory_block_size_bytes()-1))
> + return -EINVAL;
Printing a warning here with the expected alignment would be helpful.
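i.e. something like (untested):

	/* Make sure size is aligned to a memory block */
	if (val & (memory_block_size_bytes() - 1)) {
		pr_warn("trace size must be a multiple of the memory block size (0x%lx)\n",
			memory_block_size_bytes());
		return -EINVAL;
	}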
> +
> + if (memtrace_init_regions_runtime(val))
> + return -EINVAL;
> +
> + if (memtrace_init_debugfs())
> + return -EINVAL;
> +
> + memtrace_size = val;
> +
> + return 0;
> +}
I know this RFC is mostly about how the offlining path should be
handled, but can you sketch out the re-onlining path too?
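Even a rough sketch would do, something like this (completely
untested, and the names are made up):

	static int memtrace_disable_set(void *data, u64 val)
	{
		int i;

		if (val)
			return -EINVAL;

		for (i = 0; i < memtrace_array_nr; i++) {
			struct memtrace_entry *ent = &memtrace_array[i];

			/* drop our mapping before handing the chunk back */
			iounmap(ent->mem);
			if (add_memory(ent->nid, ent->start, ent->size))
				return -EBUSY;
		}

		memtrace_size = 0;
		return 0;
	}

add_memory() should put the sections back, but the pages would still
need to be onlined again (via the sysfs state files or online_pages())
and the debugfs entries torn down, so there's probably more to it
than this.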