[Skiboot] [RFC PATCH 5/5] virtual memory for OPAL boot

Nicholas Piggin npiggin at gmail.com
Mon Dec 9 22:21:38 AEDT 2019


vm_map_global / vm_unmap_global set up and tear down globally visible 1:1
mappings. vm_map / vm_unmap create and remove a per-cpu mapping, which
cannot nest.

A list of global extents, plus one local extent per CPU, is kept to describe
the active mappings. Fault handlers look these up to install SLB/HPTE entries.

Without much further change, these mappings can be used to provide virtual
memory mappings for the runtime component of skiboot, if the host OS is
modified to provide virtual memory services for OPAL.

Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
---
 core/Makefile.inc    |   2 +-
 core/cpu.c           |  22 +-
 core/exceptions.c    |  68 ++++
 core/fast-reboot.c   |  14 +-
 core/init.c          | 173 +++++++--
 core/mem_region.c    |  75 +++-
 core/opal.c          |  38 +-
 core/platform.c      |  15 +-
 core/vm.c            | 899 +++++++++++++++++++++++++++++++++++++++++++
 hdata/spira.c        |  35 +-
 hw/fake-nvram.c      |  12 +-
 hw/homer.c           |  15 +-
 hw/lpc-uart.c        |  32 +-
 hw/lpc.c             |   6 +
 hw/phb4.c            |   9 +-
 hw/psi.c             |   2 +
 hw/slw.c             |   4 +-
 hw/xive.c            |   5 +
 hw/xscom.c           |   4 +
 include/cmpxchg.h    |   3 +
 include/cpu.h        |  22 ++
 include/elf-abi.h    |  21 +-
 include/io.h         | 117 +++++-
 include/mem_region.h |   1 +
 include/platform.h   |   4 +-
 include/processor.h  |  13 +-
 include/skiboot.h    |  27 ++
 libstb/container.c   |  12 +-
 libstb/cvc.c         |   3 +
 libstb/secureboot.c  |   5 +-
 libstb/trustedboot.c |   6 +-
 skiboot.lds.S        |  26 +-
 32 files changed, 1557 insertions(+), 133 deletions(-)
 create mode 100644 core/vm.c

diff --git a/core/Makefile.inc b/core/Makefile.inc
index fddff50e9..c2b5251d7 100644
--- a/core/Makefile.inc
+++ b/core/Makefile.inc
@@ -3,7 +3,7 @@
 # -*-Makefile-*-
 
 SUBDIRS += core
-CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
+CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o vm.o
 CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
 CORE_OBJS += opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
 CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
diff --git a/core/cpu.c b/core/cpu.c
index 0c13f29de..cb4319ec8 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -409,6 +409,10 @@ static unsigned int cpu_idle_p8(enum cpu_wake_cause wake_on)
 	}
 	isync();
 
+	/* P8 must enter nap with VM disabled */
+	if (cpu->vm_setup)
+		vm_exit();
+
 	/* Enter nap */
 	vec = enter_p8_pm_state(false);
 
@@ -469,11 +473,19 @@ static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
 		/* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BIT(42) | PPC_BIT(43) |
 			PPC_BITMASK(54, 55) | PPC_BIT(63);
+		/*
+		 * stop with EC=1 wakes with vm off. P9 can stop with vm
+		 * enabled, but it's simpler to disable now and so it wakes
+		 * in the proper state.
+		 */
+		if (cpu->vm_setup)
+			vm_exit();
 		vec = enter_p9_pm_state(psscr);
 	} else {
 		/* stop with EC=0 (resumes) which does not require sreset. */
 		/* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
+		/* Can run with VM enabled */
 		enter_p9_pm_lite_state(psscr);
 	}
 
@@ -492,6 +504,7 @@ static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
 static void cpu_idle_pm(enum cpu_wake_cause wake_on)
 {
 	unsigned int vec;
+	bool was_vm_setup = this_cpu()->vm_setup;
 
 	switch(proc_gen) {
 	case proc_gen_p8:
@@ -516,12 +529,17 @@ static void cpu_idle_pm(enum cpu_wake_cause wake_on)
 		default:
 			break;
 		}
-		mtmsrd(MSR_RI, 1);
 
 	} else if (vec == 0x200) {
 		exception_entry_pm_mce();
 		enable_machine_check();
+	}
+
+	if (vec != 0) {
+		/* 0x100 or 0x200 */
 		mtmsrd(MSR_RI, 1);
+		if (was_vm_setup)
+			vm_enter();
 	}
 }
 
@@ -1354,7 +1372,7 @@ static int64_t opal_return_cpu(void)
 		printf("OPAL in_opal_call=%u\n", this_cpu()->in_opal_call);
 	}
 
-	__secondary_cpu_entry();
+	__return_cpu_entry();
 
 	return OPAL_HARDWARE; /* Should not happen */
 }
diff --git a/core/exceptions.c b/core/exceptions.c
index 4ff7a9e4b..29d28700f 100644
--- a/core/exceptions.c
+++ b/core/exceptions.c
@@ -32,8 +32,25 @@ static void dump_regs(struct stack_frame *stack)
 
 #define EXCEPTION_MAX_STR 320
 
+static void print_recoverable_mce_vm(struct stack_frame *stack, uint64_t nip, uint64_t msr)
+{
+	char buf[EXCEPTION_MAX_STR];
+	size_t l;
+
+	l = 0;
+	l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+		"Recoverable MCE with VM on at "REG"   ", nip);
+	l += snprintf_symbol(buf + l, EXCEPTION_MAX_STR - l, nip);
+	l += snprintf(buf + l, EXCEPTION_MAX_STR - l, "  MSR "REG, msr);
+	prerror("%s\n", buf);
+	dump_regs(stack);
+	prerror("Continuing with VM off\n");
+}
+
 void exception_entry(struct stack_frame *stack)
 {
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
 	bool fatal = false;
 	bool hv;
 	uint64_t nip;
@@ -41,6 +58,8 @@ void exception_entry(struct stack_frame *stack)
 	char buf[EXCEPTION_MAX_STR];
 	size_t l;
 
+	c->vm_setup = false;
+
 	switch (stack->type) {
 	case 0x500:
 	case 0x980:
@@ -83,12 +102,58 @@ void exception_entry(struct stack_frame *stack)
 		break;
 
 	case 0x200:
+		if (this_cpu()->vm_local_map_inuse)
+			fatal = true; /* local map is non-linear */
+
+		if (!fatal && (msr & (MSR_IR|MSR_DR))) {
+			print_recoverable_mce_vm(stack, nip, msr);
+			/* Turn off VM and try again */
+			vm_setup = false;
+			stack->srr1 &= ~(MSR_IR|MSR_DR);
+			goto out;
+		}
+
 		fatal = true;
 		prerror("***********************************************\n");
 		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
 			"Fatal MCE at "REG"   ", nip);
 		break;
 
+	case 0x300:
+		if (vm_dsi(nip, stack->dar, !!(stack->dsisr & DSISR_ISSTORE)))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal %s address "REG" at "REG"   ",
+			(stack->dsisr & DSISR_ISSTORE) ? "store" : "load",
+			stack->dar, nip);
+		break;
+
+	case 0x380:
+		if (vm_dslb(nip, stack->dar))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal load/store address "REG" at "REG"   ",
+			stack->dar, nip);
+		break;
+
+	case 0x400:
+		if (vm_isi(nip))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal ifetch at "REG"   ", nip);
+		break;
+
+	case 0x480:
+		if (vm_islb(nip))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal ifetch at "REG"   ", nip);
+		break;
+
 	case 0x700: {
 		struct trap_table_entry *tte;
 
@@ -136,11 +201,14 @@ void exception_entry(struct stack_frame *stack)
 		for (;;) ;
 	}
 
+out:
+	assert(!fatal);
 	if (hv) {
 		/* Set up for SRR return */
 		stack->srr0 = nip;
 		stack->srr1 = msr;
 	}
+	c->vm_setup = vm_setup;
 }
 
 void exception_entry_pm_sreset(void)
diff --git a/core/fast-reboot.c b/core/fast-reboot.c
index ea1375efd..65594efd8 100644
--- a/core/fast-reboot.c
+++ b/core/fast-reboot.c
@@ -361,6 +361,9 @@ void __noreturn fast_reboot_entry(void)
 	cpu_set_sreset_enable(true);
 	cpu_set_ipi_enable(true);
 
+	/* Enter virtual memory mode */
+	vm_init(true);
+
 	prlog(PR_INFO, "RESET: Releasing secondaries...\n");
 
 	/* Release everybody */
@@ -381,6 +384,7 @@ void __noreturn fast_reboot_entry(void)
 	fast_boot_release = false;
 
 	if (!chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+		void *t;
 		/*
 		 * mem_region_clear_unused avoids these preload regions
 		 * so it can run along side image preloading. Clear these
@@ -390,8 +394,14 @@ void __noreturn fast_reboot_entry(void)
 		 * Mambo may have embedded payload here, so don't clear
 		 * it at all.
 		 */
-		memset(KERNEL_LOAD_BASE, 0, KERNEL_LOAD_SIZE);
-		memset(INITRAMFS_LOAD_BASE, 0, INITRAMFS_LOAD_SIZE);
+
+		t = vm_map((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true);
+		memset(t, 0, KERNEL_LOAD_SIZE);
+		vm_unmap((unsigned long)t, KERNEL_LOAD_SIZE);
+
+		t = vm_map((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true);
+		memset(t, 0, INITRAMFS_LOAD_SIZE);
+		vm_unmap((unsigned long)t, INITRAMFS_LOAD_SIZE);
 	}
 
 	/* Start preloading kernel and ramdisk */
diff --git a/core/init.c b/core/init.c
index 1725639f8..6bc464407 100644
--- a/core/init.c
+++ b/core/init.c
@@ -93,6 +93,7 @@ static bool try_load_elf64_le(struct elf_hdr *header)
 	uint64_t load_base = (uint64_t)kh;
 	struct elf64le_phdr *ph;
 	unsigned int i;
+	bool ret = false;
 
 	printf("INIT: 64-bit LE kernel discovered\n");
 
@@ -104,6 +105,9 @@ static bool try_load_elf64_le(struct elf_hdr *header)
 	 * but it will not work for any ELF binary.
 	 */
 	ph = (struct elf64le_phdr *)(load_base + le64_to_cpu(kh->e_phoff));
+	vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
+			le16_to_cpu(kh->e_phnum)*sizeof(struct elf64le_phdr),
+			false, false);
 	for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) {
 		if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
 			continue;
@@ -120,7 +124,7 @@ static bool try_load_elf64_le(struct elf_hdr *header)
 
 	if (!kernel_entry) {
 		prerror("INIT: Failed to find kernel entry !\n");
-		return false;
+		goto out_unmap;
 	}
 	kernel_entry += load_base;
 	kernel_32bit = false;
@@ -132,7 +136,12 @@ static bool try_load_elf64_le(struct elf_hdr *header)
 	prlog(PR_DEBUG, "INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
 	      kernel_entry, kernel_size);
 
-	return true;
+	ret = true;
+
+out_unmap:
+	vm_unmap_global((unsigned long)ph, le16_to_cpu(kh->e_phnum)*sizeof(struct elf64le_phdr));
+
+	return ret;
 }
 
 static bool try_load_elf64(struct elf_hdr *header)
@@ -143,12 +152,17 @@ static bool try_load_elf64(struct elf_hdr *header)
 	struct elf64be_phdr *ph;
 	struct elf64be_shdr *sh;
 	unsigned int i;
+	bool ret = false;
+
+	vm_map_global("KERNEL ELF64 Header", (unsigned long)header,
+			sizeof(struct elf64be_hdr), false, false);
 
 	/* Check it's a ppc64 LE ELF */
 	if (khle->ei_ident == ELF_IDENT		&&
 	    khle->ei_data == ELF_DATA_LSB	&&
 	    le16_to_cpu(khle->e_machine) == ELF_MACH_PPC64) {
-		return try_load_elf64_le(header);
+		ret = try_load_elf64_le(header);
+		goto out_unmap1;
 	}
 
 	/* Check it's a ppc64 ELF */
@@ -156,7 +170,7 @@ static bool try_load_elf64(struct elf_hdr *header)
 	    kh->ei_data != ELF_DATA_MSB		||
 	    be16_to_cpu(kh->e_machine) != ELF_MACH_PPC64) {
 		prerror("INIT: Kernel doesn't look like an ppc64 ELF\n");
-		return false;
+		goto out_unmap1;
 	}
 
 	/* Look for a loadable program header that has our entry in it
@@ -167,6 +181,8 @@ static bool try_load_elf64(struct elf_hdr *header)
 	 * but it will not work for any ELF binary.
 	 */
 	ph = (struct elf64be_phdr *)(load_base + be64_to_cpu(kh->e_phoff));
+	vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
+			be16_to_cpu(kh->e_phnum)*sizeof(struct elf64be_phdr), false, false);
 	for (i = 0; i < be16_to_cpu(kh->e_phnum); i++, ph++) {
 		if (be32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
 			continue;
@@ -183,7 +199,7 @@ static bool try_load_elf64(struct elf_hdr *header)
 
 	if (!kernel_entry) {
 		prerror("INIT: Failed to find kernel entry !\n");
-		return false;
+		goto out_unmap2;
 	}
 
 	/* For the normal big-endian ELF ABI, the kernel entry points
@@ -193,6 +209,8 @@ static bool try_load_elf64(struct elf_hdr *header)
 	 * to assuming it obeys the ABI.
 	 */
 	sh = (struct elf64be_shdr *)(load_base + be64_to_cpu(kh->e_shoff));
+	vm_map_global("KERNEL ELF Section Headers", (unsigned long)sh,
+			be16_to_cpu(kh->e_shnum)*sizeof(struct elf64be_shdr), false, false);
 	for (i = 0; i < be16_to_cpu(kh->e_shnum); i++, sh++) {
 		if (be64_to_cpu(sh->sh_addr) <= be64_to_cpu(kh->e_entry) &&
 		    (be64_to_cpu(sh->sh_addr) + be64_to_cpu(sh->sh_size)) >
@@ -217,7 +235,15 @@ static bool try_load_elf64(struct elf_hdr *header)
 	printf("INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
 	       kernel_entry, kernel_size);
 
-	return true;
+	ret = true;
+
+	vm_unmap_global((unsigned long)sh, be16_to_cpu(kh->e_shnum)*sizeof(struct elf64be_shdr));
+out_unmap2:
+	vm_unmap_global((unsigned long)ph, be16_to_cpu(kh->e_phnum)*sizeof(struct elf64be_phdr));
+out_unmap1:
+	vm_unmap_global((unsigned long)header, sizeof(struct elf64be_hdr));
+
+	return ret;
 }
 
 static bool try_load_elf32_le(struct elf_hdr *header)
@@ -333,6 +359,7 @@ bool start_preload_kernel(void)
 	int loaded;
 
 	/* Try to load an external kernel payload through the platform hooks */
+	vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true, false);
 	kernel_size = KERNEL_LOAD_SIZE;
 	loaded = start_preload_resource(RESOURCE_ID_KERNEL,
 					RESOURCE_SUBID_NONE,
@@ -341,9 +368,11 @@ bool start_preload_kernel(void)
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform start load kernel failed\n");
 		kernel_size = 0;
+		vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
 		return false;
 	}
 
+	vm_map_global("INITRAMFS", (unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true, false);
 	initramfs_size = INITRAMFS_LOAD_SIZE;
 	loaded = start_preload_resource(RESOURCE_ID_INITRAMFS,
 					RESOURCE_SUBID_NONE,
@@ -351,6 +380,7 @@ bool start_preload_kernel(void)
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform start load initramfs failed\n");
 		initramfs_size = 0;
+		vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
 		return false;
 	}
 
@@ -360,13 +390,16 @@ bool start_preload_kernel(void)
 static bool load_kernel(void)
 {
 	void *stb_container = NULL;
-	struct elf_hdr *kh;
+	struct elf_hdr *kh, *t;
+	uint32_t ei_ident;
+	uint8_t ei_class;
 	int loaded;
 
 	prlog(PR_NOTICE, "INIT: Waiting for kernel...\n");
 
 	loaded = wait_for_resource_loaded(RESOURCE_ID_KERNEL,
 					  RESOURCE_SUBID_NONE);
+	vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
 
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform wait for kernel load failed\n");
@@ -382,8 +415,10 @@ static bool load_kernel(void)
 				((uint64_t)__builtin_kernel_start) -
 				SKIBOOT_BASE + boot_offset;
 			printf("Using built-in kernel\n");
+			vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, kernel_size, true, false);
 			memmove(KERNEL_LOAD_BASE, (void*)builtin_base,
 				kernel_size);
+			vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, kernel_size);
 		}
 	}
 
@@ -399,7 +434,7 @@ static bool load_kernel(void)
 		if (kernel_entry < EXCEPTION_VECTORS_END) {
 			cpu_set_sreset_enable(false);
 			memcpy_null(NULL, old_vectors, EXCEPTION_VECTORS_END);
-			sync_icache();
+			sync_icache(0);
 		} else {
 			/* Hack for STB in Mambo, assume at least 4kb in mem */
 			if (!kernel_size)
@@ -430,15 +465,20 @@ static bool load_kernel(void)
 	      "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
 	      kernel_size);
 
-	if (kh->ei_ident != ELF_IDENT) {
+	t = vm_map((unsigned long)kh, sizeof(*kh), false);
+	ei_ident = t->ei_ident;
+	ei_class = t->ei_class;
+	vm_unmap((unsigned long)t, sizeof(*kh));
+
+	if (ei_ident != ELF_IDENT) {
 		prerror("INIT: ELF header not found. Assuming raw binary.\n");
 		return true;
 	}
 
-	if (kh->ei_class == ELF_CLASS_64) {
+	if (ei_class == ELF_CLASS_64) {
 		if (!try_load_elf64(kh))
 			return false;
-	} else if (kh->ei_class == ELF_CLASS_32) {
+	} else if (ei_class == ELF_CLASS_32) {
 		if (!try_load_elf32(kh))
 			return false;
 	} else {
@@ -466,7 +506,7 @@ static void load_initramfs(void)
 
 	loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
 					  RESOURCE_SUBID_NONE);
-
+	vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
 	if (loaded != OPAL_SUCCESS || !initramfs_size)
 		return;
 
@@ -538,6 +578,7 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
 	const struct dt_property *memprop;
 	const char *cmdline, *stdoutp;
 	uint64_t mem_top;
+	uint32_t *t;
 
 	memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
 	if (memprop)
@@ -612,11 +653,13 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
 
 	fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
 
+	t = vm_map(kernel_entry, 4, false);
 	/* Check there is something there before we branch to it */
-	if (*(uint32_t *)kernel_entry == 0) {
+	if (*t == 0) {
 		prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n");
 		assert(0);
 	}
+	vm_unmap(kernel_entry, 4);
 
 	if (platform.exit)
 		platform.exit();
@@ -628,7 +671,10 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
 	printf("INIT: Starting kernel at 0x%llx, fdt at %p %u bytes\n",
 	       kernel_entry, fdt, fdt_totalsize(fdt));
 
-	/* Disable machine checks on all */
+	/* Go back to realmode and tear down our VM before booting kernel */
+	vm_destroy();
+
+	/* Disable machine checks, RI on all */
 	cpu_disable_ME_RI_all();
 
 	patch_traps(false);
@@ -834,37 +880,60 @@ static void setup_branch_null_catcher(void)
 
 void copy_sreset_vector(void)
 {
+	static char patch[0x100];
 	uint32_t *src, *dst;
+	uint32_t *t;
+	uint32_t len = (void *)&reset_patch_end - (void *)&reset_patch_start;
 
 	/* Copy the reset code over the entry point. */
 	src = &reset_patch_start;
+	t = vm_map((unsigned long)src, len, false);
+	memcpy(patch, t, len);
+	vm_unmap((unsigned long)src, len);
+
 	dst = (uint32_t *)0x100;
-	while(src < &reset_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
+	t = vm_map((unsigned long)dst, len, true);
+	memcpy(t, patch, len);
+	sync_icache((unsigned long)t);
+	vm_unmap((unsigned long)dst, len);
 }
 
 void copy_sreset_vector_fast_reboot(void)
 {
+	static char patch[0x100];
 	uint32_t *src, *dst;
+	uint32_t *t;
+	uint32_t len = (void *)&reset_fast_reboot_patch_end -
+			(void *)&reset_fast_reboot_patch_start;
 
 	/* Copy the reset code over the entry point. */
 	src = &reset_fast_reboot_patch_start;
+	t = vm_map((unsigned long)src, len, false);
+	memcpy(patch, t, len);
+	vm_unmap((unsigned long)src, len);
+
 	dst = (uint32_t *)0x100;
-	while(src < &reset_fast_reboot_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
+	t = vm_map((unsigned long)dst, len, true);
+	memcpy(t, patch, len);
+	sync_icache((unsigned long)t);
+	vm_unmap((unsigned long)dst, len);
 }
 
 void copy_exception_vectors(void)
 {
+	void *t;
+
+	t = vm_map(0x0, EXCEPTION_VECTORS_END, true);
+
 	/* Copy from 0x100 to EXCEPTION_VECTORS_END, avoid below 0x100 as
 	 * this is the boot flag used by CPUs still potentially entering
 	 * skiboot.
 	 */
-	memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100),
+	memcpy(t + 0x100, (void *)(SKIBOOT_BASE + 0x100),
 			EXCEPTION_VECTORS_END - 0x100);
-	sync_icache();
+
+	sync_icache((unsigned long)t);
+	vm_unmap(0x0, EXCEPTION_VECTORS_END);
 }
 
 /*
@@ -878,15 +947,16 @@ void patch_traps(bool enable)
 	for (tte = __trap_table_start; tte < __trap_table_end; tte++) {
 		uint32_t *insn;
 
-		insn = (uint32_t *)tte->address;
+		insn = vm_map(tte->address, sizeof(uint32_t), true);
 		if (enable) {
 			*insn = PPC_INST_TRAP;
 		} else {
 			*insn = PPC_INST_NOP;
 		}
+		sync_icache((unsigned long)insn);
+		vm_unmap(tte->address, sizeof(uint32_t));
 	}
 
-	sync_icache();
 }
 
 static void per_thread_sanity_checks(void)
@@ -936,19 +1006,22 @@ void pci_nvram_init(void)
 static uint32_t mem_csum(void *_p, void *_e)
 {
 	size_t len = _e - _p;
-	uint32_t *p = _p;
+	uint32_t *t;
 	uint32_t v1 = 0, v2 = 0;
 	uint32_t csum;
 	unsigned int i;
 
+	t = vm_map((unsigned long)_p, len, false);
+
 	for (i = 0; i < len; i += 4) {
-		uint32_t v = *p++;
+		uint32_t v = *t++;
 		v1 += v;
 		v2 += v1;
 	}
-
 	csum = v1 ^ v2;
 
+	vm_unmap((unsigned long)_p, len);
+
 	return csum;
 }
 
@@ -962,6 +1035,8 @@ static void checksum_romem(void)
 	if (chip_quirk(QUIRK_SLOW_SIM))
 		return;
 
+	/* Called in real mode */
+
 	csum = mem_csum(_start, _head_end);
 	romem_csum ^= csum;
 
@@ -1053,7 +1128,7 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	prlog(PR_DEBUG, "initial console log level: memory %d, driver %d\n",
 	       (debug_descriptor.console_log_levels >> 4),
 	       (debug_descriptor.console_log_levels & 0x0f));
-	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology.\n");
+	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology. Now with more indirection.\n");
 
 #ifdef SKIBOOT_GCOV
 	skiboot_gcov_done();
@@ -1065,6 +1140,9 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	/* Now locks can be used */
 	init_locks();
 
+	/* Enter virtual memory mode */
+	vm_init(false);
+
 	/* Create the OPAL call table early on, entries can be overridden
 	 * later on (FSP console code for example)
 	 */
@@ -1090,7 +1168,20 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
 		if (parse_hdat(false) < 0)
 			abort();
 	} else {
+		void *t;
+		uint32_t size;
+
+		t = vm_map((unsigned long)fdt, sizeof(struct fdt_header), false);
+		size = fdt_totalsize(t);
+		vm_unmap((unsigned long)fdt, sizeof(struct fdt_header));
+
+		/*
+		 * Would be nice to make this a local map, but it seems
+		 * to need to be expanded in place.
+		 */
+		vm_map_global("fdt", (unsigned long)fdt, size, false, false);
 		dt_expand(fdt);
+		vm_unmap_global((unsigned long)fdt, size);
 	}
 	dt_add_cpufeatures(dt_root);
 
@@ -1141,6 +1232,8 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	 */
 	init_cpu_max_pir();
 
+	vm_init_stacks();
+
 	/*
 	 * Now, we init our memory map from the device-tree, and immediately
 	 * reserve areas which we know might contain data coming from
@@ -1382,6 +1475,30 @@ void __noreturn __secondary_cpu_entry(void)
 	enable_machine_check();
 	mtmsrd(MSR_RI, 1);
 
+	vm_init_secondary();
+
+	/* Some XIVE setup */
+	xive_cpu_callin(cpu);
+
+	/* Wait for work to do */
+	while(true) {
+		if (cpu_check_jobs(cpu))
+			cpu_process_jobs();
+		else
+			cpu_idle_job();
+	}
+}
+
+void __noreturn __return_cpu_entry(void)
+{
+	struct cpu_thread *cpu = this_cpu();
+
+	/* Secondary CPU called in */
+	cpu_callin(cpu);
+
+	enable_machine_check();
+	mtmsrd(MSR_RI, 1);
+
 	/* Some XIVE setup */
 	xive_cpu_callin(cpu);
 
diff --git a/core/mem_region.c b/core/mem_region.c
index 8eda30598..16fb020cc 100644
--- a/core/mem_region.c
+++ b/core/mem_region.c
@@ -25,7 +25,7 @@
 #define POISON_MEM_REGION	0
 #endif
 #define POISON_MEM_REGION_WITH	0x99
-#define POISON_MEM_REGION_LIMIT 1*1024*1024*1024
+#define POISON_MEM_REGION_LIMIT 128*1024*1024
 
 /* Locking: The mem_region_lock protects the regions list from concurrent
  * updates. Additions to, or removals from, the region list must be done
@@ -57,24 +57,27 @@ static struct mem_region skiboot_os_reserve = {
 	.type		= REGION_OS,
 };
 
-struct mem_region skiboot_heap = {
-	.name		= "ibm,firmware-heap",
-	.start		= HEAP_BASE,
-	.len		= HEAP_SIZE,
-	.type		= REGION_SKIBOOT_HEAP,
-};
-
 static struct mem_region skiboot_code_and_text = {
 	.name		= "ibm,firmware-code",
 	.start		= SKIBOOT_BASE,
 	.len		= HEAP_BASE - SKIBOOT_BASE,
+	.vm_mapped_len	= HEAP_BASE - SKIBOOT_BASE,
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
+struct mem_region skiboot_heap = {
+	.name		= "ibm,firmware-heap",
+	.start		= HEAP_BASE,
+	.len		= HEAP_SIZE,
+	.vm_mapped_len	= HEAP_SIZE,
+	.type		= REGION_SKIBOOT_HEAP,
+};
+
 static struct mem_region skiboot_after_heap = {
 	.name		= "ibm,firmware-data",
 	.start		= HEAP_BASE + HEAP_SIZE,
 	.len		= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
+	.vm_mapped_len	= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
@@ -145,13 +148,16 @@ static struct alloc_hdr *next_hdr(const struct mem_region *region,
 static void mem_poison(struct free_hdr *f)
 {
 	size_t poison_size = (void*)tailer(f) - (void*)(f+1);
+	void *t;
 
 	/* We only poison up to a limit, as otherwise boot is
 	 * kinda slow */
 	if (poison_size > POISON_MEM_REGION_LIMIT)
 		poison_size = POISON_MEM_REGION_LIMIT;
 
-	memset(f+1, POISON_MEM_REGION_WITH, poison_size);
+	t = vm_map((unsigned long)(f+1), poison_size, true);
+	memset(t, POISON_MEM_REGION_WITH, poison_size);
+	vm_unmap((unsigned long)(f+1), poison_size);
 }
 #endif
 
@@ -159,23 +165,42 @@ static void mem_poison(struct free_hdr *f)
 static void init_allocatable_region(struct mem_region *region)
 {
 	struct free_hdr *f = region_start(region);
+	unsigned long num_longs;
+	unsigned long *t;
+
 	assert(region->type == REGION_SKIBOOT_HEAP ||
 	       region->type == REGION_MEMORY);
-	f->hdr.num_longs = region->len / sizeof(long);
+
+	num_longs = region->len / sizeof(long);
+
+	if (!region->vm_mapped_len) {
+		/* SKIBOOT_BASE-SIZE regions already come mapped */
+		region->vm_mapped_len = PAGE_SIZE;
+		vm_map_global(region->name, region->start, PAGE_SIZE, true, false);
+	}
+
+	assert(PAGE_SIZE >= sizeof(*f));
+	assert(region->len >= PAGE_SIZE*2);
+
+	f->hdr.num_longs = num_longs;
 	f->hdr.free = true;
 	f->hdr.prev_free = false;
-	*tailer(f) = f->hdr.num_longs;
 	list_head_init(&region->free_list);
 	list_add(&region->free_list, &f->list);
 #if POISON_MEM_REGION == 1
 	mem_poison(f);
 #endif
+
+	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
+	*t = num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
 }
 
 static void make_free(struct mem_region *region, struct free_hdr *f,
 		      const char *location, bool skip_poison)
 {
 	struct alloc_hdr *next;
+	unsigned long *t;
 
 #if POISON_MEM_REGION == 1
 	if (!skip_poison)
@@ -203,7 +228,9 @@ static void make_free(struct mem_region *region, struct free_hdr *f,
 	}
 
 	/* Fix up tailer. */
-	*tailer(f) = f->hdr.num_longs;
+	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
+	*t = f->hdr.num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
 
 	/* If next is free, coalesce it */
 	next = next_hdr(region, &f->hdr);
@@ -392,6 +419,7 @@ static void *__mem_alloc(struct mem_region *region, size_t size, size_t align,
 	size_t alloc_longs, offset;
 	struct free_hdr *f;
 	struct alloc_hdr *next;
+	unsigned long newsz;
 
 	/* Align must be power of 2. */
 	assert(!((align - 1) & align));
@@ -447,6 +475,17 @@ found:
 		next->prev_free = false;
 	}
 
+	newsz = ((void *)((unsigned long *)f + alloc_longs + offset) - region_start(region) + sizeof(struct free_hdr));
+	if (newsz > region->vm_mapped_len) {
+		/* TODO: unmap on free */
+		newsz += PAGE_SIZE-1;
+		newsz &= ~(PAGE_SIZE-1);
+		vm_map_global(region->name,
+			region->start + region->vm_mapped_len,
+			newsz - region->vm_mapped_len, true, false);
+		region->vm_mapped_len = newsz;
+	}
+
 	if (offset != 0) {
 		struct free_hdr *pre = f;
 
@@ -691,6 +730,7 @@ static struct mem_region *new_region(const char *name,
 	region->name = name;
 	region->start = start;
 	region->len = len;
+	region->vm_mapped_len = 0;
 	region->node = node;
 	region->type = type;
 	region->free_list.n.next = NULL;
@@ -1257,9 +1297,13 @@ static void mem_clear_range(uint64_t s, uint64_t e)
 		return;
 	}
 
-	prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
-	      (long long)s, (long long)e);
+	/*
+	 * Large clear thrashes the small hash table, with parallel clearing
+	 * this can livelock. Clear in real mode.
+	 */
+	vm_exit();
 	memset((void *)s, 0, e - s);
+	vm_enter();
 }
 
 struct mem_region_clear_job_args {
@@ -1273,7 +1317,8 @@ static void mem_region_clear_job(void *data)
 	mem_clear_range(arg->s, arg->e);
 }
 
-#define MEM_REGION_CLEAR_JOB_SIZE (16ULL*(1<<30))
+/* Limited by 256MB segment size (could fix) */
+#define MEM_REGION_CLEAR_JOB_SIZE (128ULL*(1<<20))
 
 static struct cpu_job **mem_clear_jobs;
 static struct mem_region_clear_job_args *mem_clear_job_args;
diff --git a/core/opal.c b/core/opal.c
index 1ae324b65..2420dedbe 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -44,19 +44,39 @@ static uint64_t opal_dynamic_events;
 extern uint32_t attn_trigger;
 extern uint32_t hir_trigger;
 
+void __opal_register(uint64_t token, void *func, unsigned int nargs)
+{
+	uint64_t f;
+	uint64_t *t;
+	u8 *a;
+
+	assert(token <= OPAL_LAST);
+
+	f = function_entry_address(func);
+
+	t = vm_map((unsigned long)&opal_branch_table[token], sizeof(*t), true);
+	*t = f;
+	vm_unmap((unsigned long)&opal_branch_table[token], sizeof(*t));
+
+	a = vm_map((unsigned long)&opal_num_args[token], sizeof(*a), true);
+	*a = nargs;
+	vm_unmap((unsigned long)&opal_num_args[token], sizeof(*a));
+}
 
 void opal_table_init(void)
 {
 	struct opal_table_entry *s = __opal_table_start;
 	struct opal_table_entry *e = __opal_table_end;
+	struct opal_table_entry *te;
+	size_t size = (void *)e - (void *)s;
 
 	prlog(PR_DEBUG, "OPAL table: %p .. %p, branch table: %p\n",
 	      s, e, opal_branch_table);
-	while(s < e) {
-		((uint64_t *)opal_branch_table)[s->token] = function_entry_address(s->func);
-		((u8 *)opal_num_args)[s->token] = s->nargs;
-		s++;
-	}
+
+	vm_map_global("OPAL table", (unsigned long)s, size, false, false);
+	for (te = s; te < e; te++)
+		__opal_register(te->token, te->func, te->nargs);
+	vm_unmap_global((unsigned long)s, size);
 }
 
 /* Called from head.S, thus no prototype */
@@ -319,14 +339,6 @@ int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target)
 }
 opal_call(OPAL_QUIESCE, opal_quiesce, 2);
 
-void __opal_register(uint64_t token, void *func, unsigned int nargs)
-{
-	assert(token <= OPAL_LAST);
-
-	((uint64_t *)opal_branch_table)[token] = function_entry_address(func);
-	((u8 *)opal_num_args)[token] = nargs;
-}
-
 /*
  * add_opal_firmware_exports_node: adds properties to the device-tree which
  * the OS will then change into sysfs nodes.
diff --git a/core/platform.c b/core/platform.c
index 2544f0ccf..728328e6e 100644
--- a/core/platform.c
+++ b/core/platform.c
@@ -233,8 +233,10 @@ void set_bmc_platform(const struct bmc_platform *bmc)
 
 void probe_platform(void)
 {
-	struct platform *platforms = &__platforms_start;
-	unsigned int i;
+	struct platform *s = __platforms_start;
+	struct platform *e = __platforms_end;
+	struct platform *p;
+	size_t size = (void *)e - (void *)s;
 
 	/* Detect Manufacturing mode */
 	if (dt_find_property(dt_root, "ibm,manufacturing-mode")) {
@@ -248,12 +250,15 @@ void probe_platform(void)
 		manufacturing_mode = true;
 	}
 
-	for (i = 0; &platforms[i] < &__platforms_end; i++) {
-		if (platforms[i].probe && platforms[i].probe()) {
-			platform = platforms[i];
+	vm_map_global("Platform table", (unsigned long)s, size, false, false);
+	for (p = s; p < e; p++) {
+		if (p->probe && p->probe()) {
+			platform = *p;
 			break;
 		}
 	}
+	vm_unmap_global((unsigned long)s, size);
+
 	if (!platform.name) {
 		platform = generic_platform;
 		if (platform.probe)
diff --git a/core/vm.c b/core/vm.c
new file mode 100644
index 000000000..09c441d3c
--- /dev/null
+++ b/core/vm.c
@@ -0,0 +1,899 @@
+/* Copyright 2018 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ccan/container_of/container_of.h>
+#include <ccan/list/list.h>
+#include <ccan/str/str.h>
+#include <cmpxchg.h>
+#include <cpu.h>
+#include <opal.h>
+#include <skiboot.h>
+#include <stack.h>
+#include <timebase.h>
+#include <trace.h>
+
+static bool vm_setup = false;
+static bool vm_globals_allocated = false;
+
+#define SLB_SZ		(256UL*1024*1024)
+#define SLB_NR		32
+#define LOCAL_SLB_NR	2
+#define GLOBAL_SLB_NR	(SLB_NR - LOCAL_SLB_NR)
+#define LOCAL_SLB_BASE	GLOBAL_SLB_NR
+
+#define LOCAL_EA_BEGIN	0x0001000000000000ULL
+#define LOCAL_EA_END	0x0002000000000000ULL
+
+static void __nomcount slb_install(unsigned long esid, unsigned long vsid, unsigned int index)
+{
+	unsigned long rs;
+	unsigned long rb;
+
+	rs = vsid << (63-51);		/* 256MB VSID */
+	rs |= 1UL << (63-53);		/* Kp = 1 */
+	if (PAGE_SIZE == 0x10000) {
+		rs |= 1UL << (63-55);		/* L = 1 */
+		rs |= 1UL << (63-59);		/* LP = 01 */
+	}
+
+	rb = esid << (63-35);		/* 256MB ESID */
+	rb |= 1UL << (63-36);		/* V = 1 */
+	rb |= index;
+
+	asm volatile("slbmte %0,%1" : : "r"(rs), "r"(rb) : "memory");
+}
+
+#if 0
+static void slb_remove(unsigned long esid)
+{
+	asm volatile("isync ; slbie %0 ; isync" : : "r"(esid << 28) : "memory");
+}
+#endif
+
+static void slb_remove_all(void)
+{
+	asm volatile("isync ; slbmte %0,%0 ; slbia ; isync" : : "r"(0) : "memory");
+}
+
+static void __nomcount slb_add(unsigned long ea)
+{
+	struct cpu_thread *cpu = this_cpu();
+	uint64_t esid = ea >> 28;
+	uint64_t vsid = ea >> 28;
+
+	slb_install(esid, vsid, cpu->vm_slb_rr);
+
+	cpu->vm_slb_rr++;
+	if (cpu->vm_slb_rr == GLOBAL_SLB_NR)
+		cpu->vm_slb_rr = 0;
+}
+
+struct hpte {
+	uint64_t dword[2];
+};
+
+struct hpteg {
+	struct hpte hpte[8];
+};
+
+static struct hpteg *htab;
+static unsigned long htab_shift;
+static unsigned long htab_pteg_mask;
+
+static struct lock htab_lock;
+
+static void __nomcount htab_install(unsigned long va, unsigned long pa, int rw, int ex, int ci, bool local)
+{
+	unsigned long hash;
+	struct hpteg *hpteg;
+	struct hpte *hpte;
+	unsigned long ava = va >> 23;
+	unsigned long arpn = pa >> 12;
+	unsigned long dw0, dw1;
+	unsigned long _dw0;
+	unsigned long _ava;
+	unsigned int hstart, hend;
+	unsigned int i;
+
+	if (PAGE_SIZE == 0x10000)
+		arpn >>= 4;
+
+	dw0 = ava << (63-56); /* AVA = ava */
+	dw0 |= 0x1; /* V = 1 */
+	if (PAGE_SIZE == 0x10000)
+		dw0 |= 0x4; /* L = 1 */
+	if (local)
+		dw0 |= 0x8; /* SW[0] = 1 */
+
+	if (PAGE_SIZE == 0x10000) {
+		dw1 = (arpn << (63-43 - 4)); /* ARPN||LP-4 = arpn */
+		dw1 |= (0x1 << (63-43 - 8)); /* LP = 0001 */
+	} else
+		dw1 = (arpn << (63-43 - 8)); /* ARPN||LP = arpn */
+	if (!rw)
+		dw1 |= (1UL << (63 - 0)) | (1UL << (63 - 63 + 1)); /* pp = 110 */
+	if (!ex)
+		dw1 |= (1UL << (63 - 61)); /* N = 1 */
+	dw1 |= (1UL << (63 - 60 + 1)); /* WIMG = 0010 */
+	if (ci)
+		dw1 |= (1UL << (63 - 60)) | (1UL << (63 - 60 + 2)); /* WIMG = 0111 */
+	dw1 |= (1UL << (63 - 55)) | (1UL << (63 - 56)); /* R=C=1 */
+
+	if (PAGE_SIZE == 0x10000)
+		hash = ((va >> 16) & 0xfff) ^ ((va >> 28) & 0x7fffffffffUL);
+	else
+		hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	lock(&htab_lock);
+
+	hstart = 0;
+	hend = 7;
+
+	for (i = hstart; i <= hend; i++) {
+		hpte = &hpteg->hpte[i];
+
+		_dw0 = be64_to_cpu(hpte->dword[0]);
+		if (_dw0 & 1) {
+			_ava = _dw0 >> (63 - 56);
+			if (_ava == ava) {
+				/* Replace insertion */
+				goto install;
+			}
+
+			continue;
+		}
+
+		assert(!_dw0);
+		goto install;
+	}
+
+	i = mftb();
+	i = (i ^ (i >> 4)) & 0x7;
+	hpte = &hpteg->hpte[i];
+
+install:
+	hpte->dword[0] = 0;
+	eieio();
+	hpte->dword[1] = cpu_to_be64(dw1);
+	eieio();
+	hpte->dword[0] = cpu_to_be64(dw0);
+	asm volatile("ptesync" ::: "memory");
+	unlock(&htab_lock);
+}
+
+static void htab_remove(unsigned long va, int local)
+{
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
+	unsigned long hash;
+	struct hpteg *hpteg;
+	unsigned long ava = va >> 23;
+	unsigned long dw0;
+	unsigned long rb;
+	unsigned int hstart, hend;
+	unsigned int i;
+
+	dw0 = ava << (63-56);
+	dw0 |= 0x1;
+	if (PAGE_SIZE == 0x10000)
+		dw0 |= 0x4;
+	if (local)
+		dw0 |= 0x8;
+
+	if (PAGE_SIZE == 0x10000)
+		hash = ((va >> 16) & 0xfff) ^ ((va >> 28) & 0x7fffffffffUL);
+	else
+		hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	if (vm_setup)
+		vm_exit();
+	if (!local)
+		lock(&htab_lock);
+again:
+	hstart = 0;
+	hend = 7;
+
+	for (i = hstart; i <= hend; i++) {
+		struct hpte *hpte = &hpteg->hpte[i];
+		unsigned long _raw_dw0, _dw0;
+
+		_raw_dw0 = hpte->dword[0];
+		_dw0 = be64_to_cpu(_raw_dw0);
+
+		if (!(_dw0 & 1)) {
+			assert(!_raw_dw0);
+			continue;
+		}
+
+		if (_dw0 != dw0)
+			continue;
+
+		if (local) {
+			if (__cmpxchg64(&hpte->dword[0], _raw_dw0, 0) != _raw_dw0)
+				goto again;
+		} else {
+			hpte->dword[0] = 0;
+		}
+
+		break;
+	}
+
+	if (PAGE_SIZE == 0x10000) {
+		rb = (va >> 16) << (63 - 47); /* AVA||LP-4 */
+		rb |= 0x1 << (63 - 51); /* LP=0001 */
+		rb |= 0x1; /* L=1 */
+	} else {
+		rb = va & ~0xfffUL;
+	}
+
+	if (local) {
+		asm volatile("ptesync" ::: "memory");
+		asm volatile("tlbiel %0" : : "r"(rb));
+		asm volatile("ptesync" ::: "memory");
+	} else {
+		unlock(&htab_lock);
+		asm volatile("ptesync" ::: "memory");
+		asm volatile("tlbie %0,%1" : : "r"(rb), "r"(0));
+		asm volatile("eieio ; tlbsync ; ptesync" ::: "memory");
+
+	}
+	if (vm_setup)
+		vm_enter();
+}
+
+/*
+ * Try to fix problems in callers if !strict.
+ */
+static bool vm_strict = false;
+
+static struct list_head vm_maps = LIST_HEAD_INIT(vm_maps);
+static struct lock vm_maps_lock;
+static unsigned long nr_vm_maps;
+
+static void __vm_map(const char *name, unsigned long addr, unsigned long len, unsigned long pa, bool r, bool w, bool x, bool ci, bool local)
+{
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
+	struct vm_map *new;
+	struct vm_map *vmm;
+
+	if (local) { /* per-cpu extent: only recorded in c->vm_local_map, never put on vm_maps */
+		new = &c->vm_local_map;
+		new->name = name;
+		new->address = addr;
+		new->length = len;
+		new->pa = pa;
+		new->readable = r;
+		new->writeable = w;
+		new->executable = x;
+		new->ci = ci;
+
+		return;
+	}
+
+	new = zalloc(sizeof(*new));
+	assert(new);
+
+	new->name = name;
+	new->address = addr;
+	new->length = len;
+	new->pa = pa;
+	new->readable = r;
+	new->writeable = w;
+	new->executable = x;
+	new->ci = ci;
+
+	/* Can not take a d-side fault while holding this lock */
+	if (vm_setup)
+		vm_exit();
+	lock(&vm_maps_lock);
+
+	list_for_each(&vm_maps, vmm, list) {
+		if ((addr & ~(PAGE_SIZE - 1)) >= vmm->address + vmm->length)
+			continue;
+		if (((addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) <= vmm->address)
+			continue;
+		/* XXX: PA always matches for global 1:1, but should check */
+		assert(vmm->ci == ci);
+		if (vmm->readable != r)
+			printf("VMM: %s (%lx-%lx) mismatched read permissions with same page %s (%llx-%llx)\n", name, addr, addr + len, vmm->name, vmm->address, vmm->address + vmm->length);
+		if (vmm->writeable != w)
+			printf("VMM: %s (%lx-%lx) mismatched write permissions with same page %s (%llx-%llx)\n", name, addr, addr + len, vmm->name, vmm->address, vmm->address + vmm->length);
+		if (vmm->executable != x)
+			printf("VMM: %s (%lx-%lx) mismatched execute permissions with same page %s (%llx-%llx)\n", name, addr, addr + len, vmm->name, vmm->address, vmm->address + vmm->length);
+
+		if (!strcmp(name, vmm->name) && r == vmm->readable && w == vmm->writeable && x == vmm->executable && ci == vmm->ci) {
+			if (addr == vmm->address + vmm->length &&
+					pa == vmm->pa + vmm->length) {
+				free(new);
+				vmm->length += len;
+				goto done;
+			}
+
+			if (addr + len == vmm->address &&
+					pa + len == vmm->pa) {
+				free(new);
+				vmm->address = addr;
+				vmm->pa = pa;
+				vmm->length += len;
+				goto done;
+			}
+		}
+
+		if (addr >= vmm->address + vmm->length)
+			continue;
+		if (addr + len <= vmm->address) {
+			list_add_before(&vm_maps, &new->list, &vmm->list);
+			goto found;
+		}
+
+		if (!vm_strict) {
+			prerror("vm_map_global %s %lx-%lx collided with vmm:%s %llx-%llx\n", name, addr, addr + len, vmm->name, vmm->address, vmm->address + vmm->length);
+			list_add_before(&vm_maps, &new->list, &vmm->list);
+			goto found;
+		}
+		assert(0);
+	}
+	list_add_tail(&vm_maps, &new->list);
+found:
+	nr_vm_maps++;
+done:
+	unlock(&vm_maps_lock);
+	if (vm_setup)
+		vm_enter();
+}
+
+static void __vm_unmap(unsigned long addr, unsigned long len, bool local)
+{
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
+	unsigned long end = addr + len;
+	struct vm_map *vmm, *to_free = NULL;
+
+	if (local) {
+		vmm = &c->vm_local_map;
+		assert(addr == vmm->address);
+		assert(len == vmm->length);
+		memset(vmm, 0, sizeof(struct vm_map));
+
+		if (vm_setup) {
+			while (addr < end) {
+				htab_remove(addr, local);
+				addr += PAGE_SIZE;
+			}
+		}
+
+		return;
+	}
+
+	/* Can not take a d-side fault while holding this lock */
+	if (vm_setup)
+		vm_exit();
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		struct vm_map *new;
+
+		if (addr + len <= vmm->address)
+			continue;
+		if (addr >= vmm->address + vmm->length)
+			continue;
+		if (addr == vmm->address && len == vmm->length)
+			goto found;
+
+		if (addr == vmm->address) {
+			vmm->address += len;
+			vmm->pa += len;
+			vmm->length -= len;
+			goto done;
+		}
+
+		if (addr + len == vmm->address + vmm->length) {
+			vmm->length -= len;
+			goto done;
+		}
+
+		/* Unmaps will never span multiple because they always apply to a previous map, so this is a split */
+		new = zalloc(sizeof(*new));
+		assert(new);
+		memcpy(new, vmm, sizeof(*new));
+		list_add_before(&vm_maps, &new->list, &vmm->list);
+		nr_vm_maps++;
+
+		new->length = addr - new->address;
+		vmm->address += new->length + len;
+		vmm->pa += new->length + len;
+		vmm->length -= new->length + len;
+		goto done;
+	}
+	unlock(&vm_maps_lock);
+	if (!vm_strict) {
+		prerror("unmap didn't find anything\n");
+		backtrace();
+		goto out;
+	}
+	assert(0);
+
+found:
+	list_del(&vmm->list);
+	to_free = vmm; /* only an entry unlinked here may be freed; trimmed/split entries stay on vm_maps */
+	nr_vm_maps--;
+done:
+	if (vm_setup) {
+		while (addr < end) {
+			htab_remove(addr, local);
+			addr += PAGE_SIZE;
+		}
+	}
+
+	unlock(&vm_maps_lock);
+out:
+	if (vm_setup)
+		vm_enter();
+
+	if (to_free)
+		free(to_free);
+}
+
+
+void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci)
+{
+	__vm_map(name, addr, len, addr, true, rw, false, ci, false);
+}
+
+void vm_map_global_text(const char *name, unsigned long addr, unsigned long len)
+{
+	__vm_map(name, addr, len, addr, true, false, true, false, false);
+}
+
+void vm_unmap_global(unsigned long addr, unsigned long len)
+{
+	__vm_unmap(addr, len, false);
+}
+
+
+void *vm_map(unsigned long addr, unsigned long len, bool rw)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long newaddr = (LOCAL_EA_BEGIN + ((unsigned long)c->pir << 30));
+	unsigned long end = addr + len;
+	unsigned long offset = addr & (PAGE_SIZE - 1);
+
+	/* Can't do nested mappings */
+	assert(!c->vm_local_map_inuse);
+	c->vm_local_map_inuse = true;
+
+	if (!c->vm_setup)
+		return (void *)addr;
+
+	end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	len = end - addr;
+
+	assert(len < (1 << 28)); /* same segment */
+
+	__vm_map("local", newaddr, len, addr, true, rw, false, false, true);
+
+	return (void *)newaddr + offset;
+}
+
+void vm_unmap(unsigned long addr, unsigned long len)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long newaddr = (LOCAL_EA_BEGIN + ((unsigned long)c->pir << 30));
+	unsigned long end = addr + len;
+
+	assert(c->vm_local_map_inuse);
+	c->vm_local_map_inuse = false;
+
+	if (!c->vm_setup)
+		return;
+
+	end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	len = end - addr;
+
+	assert(len < (1 << 28)); /* same segment */
+
+	__vm_unmap(newaddr, len, true);
+}
+
+struct prte {
+	unsigned long dword[2];
+};
+
+static struct prte *prtab;
+
+static void vm_init_cpu(void)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long esid = (LOCAL_EA_BEGIN + ((unsigned long)c->pir << 30)) >> 28;
+	unsigned long vsid = (LOCAL_EA_BEGIN + ((unsigned long)c->pir << 30)) >> 28;
+
+	mtspr(SPR_LPCR, (mfspr(SPR_LPCR)
+		& ~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43)))
+		| PPC_BIT(54));
+
+	mtspr(SPR_LPID, 0);
+	mtspr(SPR_PID, 0);
+	mtspr(SPR_HRMOR, 0);
+	mtspr(SPR_PTCR, (unsigned long)prtab);
+	mtspr(SPR_AMR, 0);
+	mtspr(SPR_IAMR, 0);
+	mtspr(SPR_AMOR, 0);
+	mtspr(SPR_UAMOR, 0);
+
+	slb_remove_all();
+	slb_install(esid, vsid, LOCAL_SLB_BASE);
+}
+
+void vm_init_secondary(void)
+{
+	vm_init_cpu();
+	vm_enter();
+}
+
+bool vm_realmode(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	return !vm_setup || !c->vm_setup;
+}
+
+void vm_enter(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	if (c->vm_setup) {
+		mtmsr(mfmsr() | (MSR_IR|MSR_DR));
+		prerror("CPU:%d vm_enter already entered\n", c->pir);
+		backtrace();
+		return;
+	}
+	c->vm_setup = true;
+	mtmsr(mfmsr() | (MSR_IR|MSR_DR));
+}
+
+void vm_exit(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	if (!c->vm_setup) {
+		mtmsr(mfmsr() & ~(MSR_IR|MSR_DR));
+		prerror("CPU:%d vm_exit already exited\n", c->pir);
+		backtrace();
+		return;
+	}
+	c->vm_setup = false;
+	mtmsr(mfmsr() & ~(MSR_IR|MSR_DR));
+}
+
+bool __nomcount vm_dslb(uint64_t nia, uint64_t dar)
+{
+	/*
+	 * Per-cpu map ranges are bolted to per-cpu SLBs.
+	 */
+	assert((dar < LOCAL_EA_BEGIN) ||
+		(dar >= LOCAL_EA_END));
+
+	(void)nia;
+	slb_add(dar);
+
+	return true;
+}
+
+bool __nomcount vm_islb(uint64_t nia)
+{
+	slb_add(nia);
+
+	return true;
+}
+
+bool __nomcount vm_dsi(uint64_t nia, uint64_t dar, bool store)
+{
+	struct cpu_thread *c = this_cpu();
+	struct vm_map *vmm;
+	uint64_t pa;
+	bool ret = true;
+	bool local;
+
+	(void)nia;
+
+	if ((dar >= LOCAL_EA_BEGIN) && (dar < LOCAL_EA_END)) {
+		local = true;
+		vmm = &c->vm_local_map;
+		if (dar >= vmm->address && dar < vmm->address + vmm->length)
+			goto found;
+		goto not_found;
+	}
+
+	local = false;
+
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		assert(vmm->pa == vmm->address);
+		if (dar >= vmm->address && dar < vmm->address + vmm->length)
+			goto found;
+	}
+	if (!vm_strict) {
+		if (dar >= 0x0006000000000000 && dar < 0x0007000000000000)
+			/* MMIO */
+			htab_install(dar, dar, 1, 0, 1, false);
+		else if (dar < LOCAL_EA_BEGIN)
+			htab_install(dar, dar, 1, 0, 0, false);
+		else
+			ret = false;
+		unlock(&vm_maps_lock);
+		prerror("Page fault with no VMM at NIA:0x%016llx DAR:0x%016llx, store:%d\n", nia, dar, store);
+		backtrace();
+		list_for_each(&vm_maps, vmm, list)
+			prlog(PR_DEBUG, "%28s 0x%08llx-0x%08llx\n", vmm->name,
+				vmm->address, vmm->address + vmm->length);
+		goto out;
+	}
+	unlock(&vm_maps_lock);
+not_found:
+	prerror("  vmm not found\n");
+	ret = false;
+	assert(0);
+	goto out;
+
+found:
+	pa = vmm->pa + (dar & ~(PAGE_SIZE - 1)) - vmm->address;
+	if (!vmm->readable) {
+		if (!local)
+			unlock(&vm_maps_lock);
+		prerror("  vmm not readable\n");
+		ret = false;
+		assert(0);
+		goto out;
+	}
+	if (store && !vmm->writeable) {
+		if (!vm_strict) {
+			htab_install(dar, pa, store, 0, vmm->ci, local);
+			if (!local)
+				unlock(&vm_maps_lock);
+			prerror("Page fault store to RO VMM:%s at NIA:0x%016llx DAR:0x%016llx\n", vmm->name, nia, dar);
+			backtrace();
+			goto out;
+		}
+		if (!local)
+			unlock(&vm_maps_lock);
+		prerror("  vmm not writeable\n");
+		ret = false;
+		assert(0);
+		goto out;
+	}
+
+	htab_install(dar, pa, vmm->writeable, vmm->executable, vmm->ci, local);
+	if (!local)
+		unlock(&vm_maps_lock);
+
+out:
+	return ret;
+}
+
+bool __nomcount vm_isi(uint64_t nia)
+{
+	struct vm_map *vmm;
+
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		assert(vmm->pa == vmm->address);
+		if (nia >= vmm->address && nia < vmm->address + vmm->length) {
+			if (!vmm->executable)
+				prerror("Page fault at NIA:0x%016llx NX mapping!\n", nia);
+			goto found;
+		}
+	}
+
+	prerror("Page fault, no mapping for NIA:0x%016llx !\n", nia);
+
+found:
+	unlock(&vm_maps_lock);
+	htab_install(nia, nia, 0, 1, 0, false);
+
+	return true;
+}
+
+static void cpu_stop_vm(void *arg __unused)
+{
+	vm_exit();
+}
+
+static void cpu_cleanup_vm(void *arg __unused)
+{
+	slb_remove_all();
+	mtspr(SPR_PTCR, 0);
+}
+
+static void cpu_all_destroy_vm(void)
+{
+	struct cpu_thread *cpu;
+	struct cpu_job **jobs;
+
+	jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1)); /* indexed by PIR, 0..cpu_max_pir */
+	assert(jobs);
+
+	/* Stop all CPUs */
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu())
+			continue;
+		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_stop_vm",
+						cpu_stop_vm, NULL);
+	}
+
+	/* this cpu */
+	cpu_stop_vm(NULL);
+
+	/* Cleanup after all stop */
+	for_each_available_cpu(cpu) {
+		if (jobs[cpu->pir])
+			cpu_wait_job(jobs[cpu->pir], true);
+	}
+
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu())
+			continue;
+		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_cleanup_vm",
+						cpu_cleanup_vm, NULL);
+	}
+
+	/* this cpu */
+	cpu_cleanup_vm(NULL);
+
+	for_each_available_cpu(cpu) {
+		if (jobs[cpu->pir])
+			cpu_wait_job(jobs[cpu->pir], true);
+	}
+
+	free(jobs);
+}
+
+void vm_init(bool fast_reboot)
+{
+	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
+	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
+	unsigned long htab_nr_bytes;
+	unsigned long htab_nr_ptegs;
+
+	prtab = memalign(64*1024, 64*1024);
+	assert(prtab);
+	memset(prtab, 0, 64*1024);
+
+	htab_shift = 18; /* 256kB table */
+	htab_nr_bytes = 1UL << htab_shift;
+	htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
+	htab_pteg_mask = htab_nr_ptegs - 1;
+	htab = memalign(1UL << htab_shift, htab_nr_bytes);
+	assert(htab);
+	memset(htab, 0, htab_nr_bytes);
+
+	prtab[0].dword[0] = cpu_to_be64((unsigned long)htab | (htab_shift - 18));
+	prtab[0].dword[1] = 0;
+
+	eieio();
+
+	vm_init_cpu();
+
+	cleanup_global_tlb();
+
+	if (vm_globals_allocated) {
+		assert(fast_reboot);
+		goto done;
+	}
+
+	assert(!fast_reboot);
+	vm_globals_allocated = true;
+
+	vm_map_global_text("OPAL text", (unsigned long)_stext,
+			   (unsigned long)_etext - (unsigned long)_stext);
+	vm_map_global("OPAL rodata", (unsigned long)__rodata_start,
+		      (unsigned long)__vm_mapped_romem_end - (unsigned long)__rodata_start,
+		      false, false);
+	vm_map_global("OPAL data", (unsigned long)_sdata,
+		      (unsigned long)_edata - (unsigned long)_sdata,
+		      true, false);
+	vm_map_global("OPAL bss", (unsigned long)_sbss,
+		      (unsigned long)_ebss - (unsigned long)_sbss,
+		      true, false);
+	vm_map_global("OPAL heap", HEAP_BASE, HEAP_SIZE, true, false);
+	vm_map_global("Memory console", INMEM_CON_START, INMEM_CON_LEN, true, false);
+	vm_map_global("Hostboot console", HBRT_CON_START, HBRT_CON_LEN, false, false);
+	vm_map_global("SPIRA heap", SPIRA_HEAP_BASE, SPIRA_HEAP_SIZE, false, false);
+	vm_map_global("PSI TCE table", PSI_TCE_TABLE_BASE, PSI_TCE_TABLE_SIZE, false, false);
+	vm_map_global("OPAL boot stacks", stack_start, stack_end - stack_start, true, false);
+
+done:
+	if (1) {
+		struct vm_map *vmm;
+		prlog(PR_DEBUG, "VMM: SETUP\n");
+		prlog(PR_DEBUG, " PRTAB:%p\n", prtab);
+		prlog(PR_DEBUG, " HTAB: %p\n", htab);
+		prlog(PR_DEBUG, " %lu Global mappings\n", nr_vm_maps);
+		list_for_each(&vm_maps, vmm, list)
+			prlog(PR_DEBUG, "%28s 0x%08llx-0x%08llx\n", vmm->name,
+				vmm->address, vmm->address + vmm->length);
+	}
+
+	vm_setup = true;
+
+	vm_enter();
+}
+
+void vm_init_stacks(void)
+{
+	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
+	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
+	struct cpu_thread *c = this_cpu();
+	struct vm_map *vmm;
+
+	/* Can not take a d-side fault while holding this lock, so clear MSR_DR around it */
+	if (c->vm_setup)
+		mtmsr(mfmsr() & ~MSR_DR);
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		if (vmm->address >= stack_end)
+			continue;
+		if (vmm->address + vmm->length <= stack_start)
+			continue;
+		goto found;
+	}
+	unlock(&vm_maps_lock);
+	assert(0);
+
+found:
+	vmm->name = "OPAL stacks";
+	vmm->address = stack_start;
+	vmm->length = stack_end - stack_start;
+	unlock(&vm_maps_lock);
+	if (c->vm_setup)
+		mtmsr(mfmsr() | MSR_DR);
+}
+
+void vm_destroy(void)
+{
+	assert(vm_setup);
+
+	if (1) {
+		struct vm_map *vmm;
+		prlog(PR_DEBUG, "VMM: TEARDOWN\n");
+		prlog(PR_DEBUG, " %lu Global mappings\n", nr_vm_maps);
+		list_for_each(&vm_maps, vmm, list)
+			prlog(PR_DEBUG, "%28s 0x%08llx-0x%08llx\n", vmm->name,
+				vmm->address, vmm->address + vmm->length);
+	}
+
+	cpu_all_destroy_vm();
+
+	vm_setup = false;
+
+	if (0) { /* XXX: leave for VMM enabled fast-reboot */
+		while (!list_empty(&vm_maps)) {
+			struct vm_map *vmm;
+			vmm = list_pop(&vm_maps, struct vm_map, list);
+			free(vmm);
+		}
+	}
+
+	free(htab);
+	htab = NULL;
+	free(prtab);
+	prtab = NULL;
+}
diff --git a/hdata/spira.c b/hdata/spira.c
index 77c937b10..f11bec381 100644
--- a/hdata/spira.c
+++ b/hdata/spira.c
@@ -1704,11 +1704,20 @@ static void fixup_spira(void)
 static void update_spirah_addr(void)
 {
 #if !defined(TEST)
+	uint64_t *spirah_offset;
+	uint64_t *spira_offset;
+
 	if (proc_gen < proc_gen_p9)
 		return;
 
-	naca.spirah_addr = CPU_TO_BE64(SPIRAH_OFF);
-	naca.spira_addr = CPU_TO_BE64(SPIRA_OFF);
+	spirah_offset = vm_map((u64)&naca, sizeof(u64), true);
+	*spirah_offset = CPU_TO_BE64(SPIRAH_OFF);
+	vm_unmap((unsigned long)spirah_offset, sizeof(u64));
+
+	spira_offset = vm_map((u64)&naca + 0x30, sizeof(u64), true);
+	*spira_offset = CPU_TO_BE64(SPIRA_OFF);
+	vm_unmap((unsigned long)spira_offset, sizeof(u64));
+
 	spirah.ntuples.hs_data_area.addr = CPU_TO_BE64(SPIRA_HEAP_BASE - SKIBOOT_BASE);
 	spirah.ntuples.mdump_res.addr = CPU_TO_BE64(MDRT_TABLE_BASE - SKIBOOT_BASE);
 #endif
@@ -1716,13 +1725,24 @@ static void update_spirah_addr(void)
 
 int parse_hdat(bool is_opal)
 {
+	int ret = 0;
+
 	cpu_type = PVR_TYPE(mfspr(SPR_PVR));
 
 	prlog(PR_DEBUG, "Parsing HDAT...\n");
 
+	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), true, false);
 	fixup_spira();
+	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
 
+	vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), true, false);
 	update_spirah_addr();
+	vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
+
+	/* Downgrade to read-only */
+
+	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), false, false);
+	vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), false, false);
 
 	/*
 	 * Basic DT root stuff
@@ -1743,8 +1763,10 @@ int parse_hdat(bool is_opal)
 	dt_init_led_node();
 
 	/* Parse PCIA */
-	if (!pcia_parse())
-		return -1;
+	if (!pcia_parse()) {
+		ret = -1;
+		goto out;
+	}
 
 	/* IPL params */
 	add_iplparams();
@@ -1790,6 +1812,9 @@ int parse_hdat(bool is_opal)
 		node_stb_parse();
 
 	prlog(PR_DEBUG, "Parsing HDAT...done\n");
+out:
+	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
+	vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
 
-	return 0;
+	return ret;
 }
diff --git a/hw/fake-nvram.c b/hw/fake-nvram.c
index 6411400eb..bac13a3de 100644
--- a/hw/fake-nvram.c
+++ b/hw/fake-nvram.c
@@ -23,12 +23,16 @@ int fake_nvram_info(uint32_t *total_size)
 
 int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 {
+	void *t;
+
 	if (!nvram_region)
 		return -ENODEV;
 
+	t = vm_map(nvram_region->start + src, len, false);
 	lock(&fake_nvram_lock);
-	memcpy(dst, (void *) (nvram_region->start + src), len);
+	memcpy(dst, t, len);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + src, len);
 
 	nvram_read_complete(true);
 
@@ -37,12 +41,16 @@ int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 
 int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
 {
+	void *t;
+
 	if (!nvram_region)
 		return OPAL_HARDWARE;
 
+	t = vm_map(nvram_region->start + offset, size, true);
 	lock(&fake_nvram_lock);
-	memcpy((void *) (nvram_region->start + offset), src, size);
+	memcpy(t, src, size);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + offset, size);
 
 	return 0;
 }
diff --git a/hw/homer.c b/hw/homer.c
index 96a01fdbc..eb23e8118 100644
--- a/hw/homer.c
+++ b/hw/homer.c
@@ -108,6 +108,9 @@ static void homer_init_chip(struct proc_chip *chip)
 
 		chip->homer_base = hbase;
 		chip->homer_size = hsize;
+		/* slw late init and xive late init want to write to HOMER */
+		/* XXX: make it read only until then? */
+		vm_map_global("HOMER Image", hbase, hsize, true, false);
 	}
 
 	/*
@@ -134,13 +137,21 @@ static void homer_init_chip(struct proc_chip *chip)
 		chip->slw_base = sbase;
 		chip->slw_bar_size = ssize;
 		chip->slw_image_size = ssize; /* will be adjusted later */
+		/* XXX */
 	}
 
 	if (read_pba_bar(chip, bar_occ_common, &obase, &osize)) {
-		prlog(PR_DEBUG, "  OCC Common Area at 0x%llx size %lldMB\n",
-		      obase, osize / 0x100000);
+		static uint64_t homer_obase = 0;
+
 		chip->occ_common_base = obase;
 		chip->occ_common_size = osize;
+
+		prlog(PR_DEBUG, "  OCC Common Area at 0x%llx size %lldMB\n",
+		      obase, osize / 0x100000);
+		if (obase != homer_obase) {
+			vm_map_global("OCC Common Area", obase, osize, false, false);
+			homer_obase = obase;
+		}
 	}
 }
 
diff --git a/hw/lpc-uart.c b/hw/lpc-uart.c
index b37e04201..eada52e06 100644
--- a/hw/lpc-uart.c
+++ b/hw/lpc-uart.c
@@ -59,7 +59,7 @@ static uint32_t uart_base;
 static bool has_irq = false, irq_ok, rx_full, tx_full;
 static uint8_t tx_room;
 static uint8_t cached_ier;
-static void *mmio_uart_base;
+void *mmio_uart_base;
 static int uart_console_policy = UART_CONSOLE_OPAL;
 static int lpc_irq = -1;
 
@@ -591,6 +591,8 @@ void early_uart_init(void)
 	if (!mmio_uart_base)
 		return;
 
+	vm_map_global("UART MMIO", (unsigned long)mmio_uart_base, 8, true, true);
+
 	clk = dt_prop_get_u32(uart_node, "clock-frequency");
 	baud = dt_prop_get_u32(uart_node, "current-speed");
 
@@ -599,6 +601,7 @@ void early_uart_init(void)
 		prlog(PR_DEBUG, "UART: Using UART at %p\n", mmio_uart_base);
 	} else {
 		prerror("UART: Early init failed!");
+		vm_unmap_global((unsigned long)mmio_uart_base, 8);
 		mmio_uart_base = NULL;
 	}
 }
@@ -610,9 +613,6 @@ void uart_init(void)
 	char *path __unused;
 	const be32 *irqp;
 
-	/* Clean up after early_uart_init() */
-	mmio_uart_base = NULL;
-
 	/* UART lock is in the console path and thus must block
 	 * printf re-entrancy
 	 */
@@ -630,13 +630,28 @@ void uart_init(void)
 	 * directly mapped UARTs in simulation environments
 	 */
 	if (n->parent == dt_root) {
+		void *base;
+
 		printf("UART: Found at root !\n");
-		mmio_uart_base = (void *)dt_translate_address(n, 0, NULL);
-		if (!mmio_uart_base) {
+
+		base = (void *)dt_translate_address(n, 0, NULL);
+		if (!base) {
 			printf("UART: Failed to translate address !\n");
 			return;
 		}
 
+		if (mmio_uart_base != base) {
+			void *old;
+
+			vm_map_global("UART MMIO", (unsigned long)base, 8, true, true);
+			old = mmio_uart_base;
+			mmio_uart_base = base;
+
+			/* Clean up after early_uart_init() */
+			if (old)
+				vm_unmap_global((unsigned long)old, 8);
+		}
+
 		/* If it has an interrupt properly, we consider this to be
 		 * a direct XICS/XIVE interrupt
 		 */
@@ -665,6 +680,11 @@ void uart_init(void)
 			lpc_irq = be32_to_cpu(*irqp);
 			prlog(PR_DEBUG, "UART: Using LPC IRQ %d\n", lpc_irq);
 		}
+
+		if (mmio_uart_base) {
+//			vm_unmap_global((unsigned long)mmio_uart_base, 8);
+			mmio_uart_base = NULL;
+		}
 	}
 
 
diff --git a/hw/lpc.c b/hw/lpc.c
index abf549746..89b7a367f 100644
--- a/hw/lpc.c
+++ b/hw/lpc.c
@@ -1239,6 +1239,7 @@ static void lpc_init_chip_p8(struct dt_node *xn)
 	chip->lpc = lpc;
 }
 
+extern void *mmio_uart_base; /* defined in hw/lpc-uart.c; declare only, to avoid a duplicate definition */
 static void lpc_init_chip_p9(struct dt_node *opb_node)
 {
 	uint32_t gcid = dt_get_chip_id(opb_node);
@@ -1261,6 +1262,11 @@ static void lpc_init_chip_p9(struct dt_node *opb_node)
 	if (!lpc_node)
 		return;
 
+
+	if (mmio_uart_base)
+		vm_unmap_global((unsigned long)mmio_uart_base, 8);
+	vm_map_global("LPC MMIO", addr, 0x100000000UL /* XXX: size? */, true, true);
+
 	lpc = zalloc(sizeof(struct lpcm));
 	assert(lpc);
 	lpc->chip_id = gcid;
diff --git a/hw/phb4.c b/hw/phb4.c
index ed7f4e5c4..2f98356f6 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -5825,6 +5825,7 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	uint64_t val, phb_bar = 0, irq_bar = 0, bar_en;
 	uint64_t mmio0_bar = 0, mmio0_bmask, mmio0_sz;
 	uint64_t mmio1_bar = 0, mmio1_bmask, mmio1_sz;
+	uint64_t bar_sz;
 	void *foo;
 	__be64 mmio_win[4];
 	unsigned int mmio_win_sz;
@@ -5853,7 +5854,8 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	bar_en = 0;
 
 	/* Initialize PHB register BAR */
-	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, NULL);
+	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, &bar_sz);
+	vm_map_global("PHB REGS", phb_bar, bar_sz, true, true);
 	rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
 			 phb_bar << 8);
 
@@ -5867,18 +5869,21 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
 
 	/* Same with INT BAR (ESB) */
-	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, NULL);
+	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, &bar_sz);
+	vm_map_global("PHB IRQ", irq_bar, bar_sz, true, true);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_IRQ_BAR, irq_bar << 8);
 	bar_en |= XPEC_NEST_STK_BAR_EN_INT;
 
 
 	/* Same with MMIO windows */
 	phys_map_get(gcid, PHB4_64BIT_MMIO, phb_num, &mmio0_bar, &mmio0_sz);
+	vm_map_global("PHB MMIO0", mmio0_bar, mmio0_sz, true, true);
 	mmio0_bmask =  (~(mmio0_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0, mmio0_bar << 8);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0_MASK, mmio0_bmask << 8);
 
 	phys_map_get(gcid, PHB4_32BIT_MMIO, phb_num, &mmio1_bar, &mmio1_sz);
+	vm_map_global("PHB MMIO1", mmio1_bar, mmio1_sz, true, true);
 	mmio1_bmask =  (~(mmio1_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1, mmio1_bar << 8);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1_MASK, mmio1_bmask << 8);
diff --git a/hw/psi.c b/hw/psi.c
index eede4e5b4..6c7794119 100644
--- a/hw/psi.c
+++ b/hw/psi.c
@@ -908,6 +908,8 @@ static bool psi_init_psihb(struct dt_node *psihb)
 
 	list_add(&psis, &psi->list);
 
+	vm_map_global("PSI", (unsigned long)psi->regs, 0x100, true, true);
+
 	val = in_be64(psi->regs + PSIHB_CR);
 	if (val & PSIHB_CR_FSP_LINK_ACTIVE) {
 		lock(&psi_lock);
diff --git a/hw/slw.c b/hw/slw.c
index 2f7619793..0267d3c30 100644
--- a/hw/slw.c
+++ b/hw/slw.c
@@ -151,7 +151,7 @@ static void slw_patch_reset(void)
 		*(sav++) = *(dst);
 		*(dst++) = *(src++);
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static void slw_unpatch_reset(void)
@@ -167,7 +167,7 @@ static void slw_unpatch_reset(void)
 		*(dst++) = *(sav++);
 		src++;
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
diff --git a/hw/xive.c b/hw/xive.c
index 41575dae7..f04164a22 100644
--- a/hw/xive.c
+++ b/hw/xive.c
@@ -1397,6 +1397,7 @@ static bool xive_configure_bars(struct xive *x)
 
 	/* IC BAR */
 	phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
+	vm_map_global("XIVE IC", (unsigned long)x->ic_base, x->ic_size, true, true);
 	val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID;
 	if (IC_PAGE_SIZE == 0x10000) {
 		val |= CQ_IC_BAR_64K;
@@ -1412,6 +1413,8 @@ static bool xive_configure_bars(struct xive *x)
 	 * all phys_map_get(XIVE_TM) calls.
 	 */
 	phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size);
+	if (chip_id == 0)
+		vm_map_global("XIVE TM", (unsigned long)x->tm_base, x->tm_size, true, true);
 	val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID;
 	if (TM_PAGE_SIZE == 0x10000) {
 		x->tm_shift = 16;
@@ -1427,6 +1430,7 @@ static bool xive_configure_bars(struct xive *x)
 
 	/* PC BAR. Clear first, write mask, then write value */
 	phys_map_get(chip_id, XIVE_PC, 0, (uint64_t *)&x->pc_base, &x->pc_size);
+	vm_map_global("XIVE PC", (unsigned long)x->pc_base, x->pc_size, true, true);
 	xive_regwx(x, CQ_PC_BAR, 0);
 	if (x->last_reg_error)
 		return false;
@@ -1441,6 +1445,7 @@ static bool xive_configure_bars(struct xive *x)
 
 	/* VC BAR. Clear first, write mask, then write value */
 	phys_map_get(chip_id, XIVE_VC, 0, (uint64_t *)&x->vc_base, &x->vc_size);
+	vm_map_global("XIVE VC", (unsigned long)x->vc_base, x->vc_size, true, true);
 	xive_regwx(x, CQ_VC_BAR, 0);
 	if (x->last_reg_error)
 		return false;
diff --git a/hw/xscom.c b/hw/xscom.c
index a85169598..67808a18e 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -860,6 +860,7 @@ void xscom_init(void)
 		const struct dt_property *reg;
 		struct proc_chip *chip;
 		const char *chip_name;
+		u64 size;
 		static const char *chip_names[] = {
 			"UNKNOWN", "P8E", "P8", "P8NVL", "P9N", "P9C", "P9P"
 		};
@@ -874,6 +875,9 @@ void xscom_init(void)
 		assert(reg);
 
 		chip->xscom_base = dt_translate_address(xn, 0, NULL);
+		size = dt_property_get_u64(reg, 1);
+
+		vm_map_global("XSCOM MMIO", chip->xscom_base, size, true, true);
 
 		/* Grab processor type and EC level */
 		xscom_init_chip_info(chip);
diff --git a/include/cmpxchg.h b/include/cmpxchg.h
index 3541a41f4..5d518ece6 100644
--- a/include/cmpxchg.h
+++ b/include/cmpxchg.h
@@ -5,6 +5,9 @@
 #define __CMPXCHG_H
 
 #ifndef __TEST__
+#include <stdint.h>
+#include <processor.h>
+
 /*
  * Bare cmpxchg, no barriers.
  */
diff --git a/include/cpu.h b/include/cpu.h
index f44a828b1..c5962a6ee 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -12,6 +12,19 @@
 #include <stack.h>
 #include <timer.h>
 
+struct vm_map {
+	struct list_node list;
+
+	const char *name;
+	uint64_t address;
+	uint64_t pa;
+	uint64_t length;
+	bool readable;
+	bool writeable;
+	bool executable;
+	bool ci;
+};
+
 /*
  * cpu_thread is our internal structure representing each
  * thread in the system
@@ -71,10 +84,19 @@ struct cpu_thread {
 	struct bt_entry			stack_bot_bt[CPU_BACKTRACE_SIZE];
 	struct bt_metadata		stack_bot_bt_metadata;
 #endif
+	/*
+	 * Per-thread VM parameters
+	 */
+	struct vm_map			vm_local_map; /* per-cpu map */
+	bool				vm_local_map_inuse;
+	uint8_t				vm_slb_rr; /* RR allocator */
+	bool				vm_setup; /* virtual memory is up */
+
 	struct lock			job_lock;
 	struct list_head		job_queue;
 	uint32_t			job_count;
 	bool				job_has_no_return;
+
 	/*
 	 * Per-core mask tracking for threads in HMI handler and
 	 * a cleanup done bit.
diff --git a/include/elf-abi.h b/include/elf-abi.h
index 827f2af19..f0cb87ec2 100644
--- a/include/elf-abi.h
+++ b/include/elf-abi.h
@@ -21,7 +21,16 @@
 static inline uint64_t function_entry_address(void *func)
 {
 #ifdef ELF_ABI_v2
-	u32 *insn = func;
+	u32 *ret = func;
+	u32 *i;
+	u32 insn;
+	u32 insn2;
+
+	i = vm_map((unsigned long)func, sizeof(insn)*2, false);
+	insn = *i;
+	insn2 = *(i+1);
+	vm_unmap((unsigned long)func, sizeof(insn)*2);
+
 	/*
 	 * A PPC64 ABIv2 function may have a local and a global entry
 	 * point. We use the local entry point for branch tables called
@@ -38,12 +47,12 @@ static inline uint64_t function_entry_address(void *func)
 	 * lis   r2,XXXX
 	 * addi  r2,r2,XXXX
 	 */
-	if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
-	     ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
-	    ((*(insn+1) & OP_RT_RA_MASK) == ADDI_R2_R2))
-		return (uint64_t)(insn + 2);
+	if ((((insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
+	     ((insn & OP_RT_RA_MASK) == LIS_R2)) &&
+	    ((insn2 & OP_RT_RA_MASK) == ADDI_R2_R2))
+		return (uint64_t)(ret + 2);
 	else
-		return (uint64_t)func;
+		return (uint64_t)ret;
 #else
 	return *(uint64_t *)func;
 #endif
diff --git a/include/io.h b/include/io.h
index 57dddd49f..d32990a6b 100644
--- a/include/io.h
+++ b/include/io.h
@@ -7,6 +7,7 @@
 #ifndef __ASSEMBLY__
 
 #include <compiler.h>
+#include <skiboot.h>
 #include <stdint.h>
 #include <processor.h>
 #include <types.h>
@@ -23,8 +24,14 @@
 static inline uint8_t __in_8(const volatile uint8_t *addr)
 {
 	uint8_t val;
-	asm volatile("lbzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lbzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lbzx %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return val;
 }
 
@@ -37,8 +44,14 @@ static inline uint8_t in_8(const volatile uint8_t *addr)
 static inline uint16_t __in_be16(const volatile uint16_t *addr)
 {
 	__be16 val;
-	asm volatile("lhzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lhzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lhzx %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return be16_to_cpu(val);
 }
 
@@ -51,8 +64,14 @@ static inline uint16_t in_be16(const volatile uint16_t *addr)
 static inline uint16_t __in_le16(const volatile uint16_t *addr)
 {
 	__le16 val;
-	asm volatile("lhzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lhzcix %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lhzx %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return le16_to_cpu(val);
 }
 
@@ -65,8 +84,14 @@ static inline uint16_t in_le16(const volatile uint16_t *addr)
 static inline uint32_t __in_be32(const volatile uint32_t *addr)
 {
 	__be32 val;
-	asm volatile("lwzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lwzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lwzx %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return be32_to_cpu(val);
 }
 
@@ -79,8 +104,14 @@ static inline uint32_t in_be32(const volatile uint32_t *addr)
 static inline uint32_t __in_le32(const volatile uint32_t *addr)
 {
 	__le32 val;
-	asm volatile("lwzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lwzcix %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lwzx %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return le32_to_cpu(val);
 }
 
@@ -93,8 +124,14 @@ static inline uint32_t in_le32(const volatile uint32_t *addr)
 static inline uint64_t __in_be64(const volatile uint64_t *addr)
 {
 	__be64 val;
-	asm volatile("ldcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("ldcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else /* XXX: else case could just be a normal load/store with endian flip which should compile better */
+		asm volatile("ldx %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return be64_to_cpu(val);
 }
 
@@ -107,8 +144,14 @@ static inline uint64_t in_be64(const volatile uint64_t *addr)
 static inline uint64_t __in_le64(const volatile uint64_t *addr)
 {
 	__le64 val;
-	asm volatile("ldcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("ldcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("ldx %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return le64_to_cpu(val);
 }
 
@@ -120,7 +163,11 @@ static inline uint64_t in_le64(const volatile uint64_t *addr)
 
 static inline void __out_8(volatile uint8_t *addr, uint8_t val)
 {
-	asm volatile("stbcix %0,0,%1"
+	if (vm_realmode())
+		asm volatile("stbcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stbx %0,0,%1"
 		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
@@ -132,8 +179,13 @@ static inline void out_8(volatile uint8_t *addr, uint8_t val)
 
 static inline void __out_be16(volatile uint16_t *addr, uint16_t val)
 {
-	asm volatile("sthcix %0,0,%1"
-		     : : "r"(cpu_to_be16(val)), "r"(addr), "m"(*addr) : "memory");
+	val = cpu_to_be16(val);
+	if (vm_realmode())
+		asm volatile("sthcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("sthx %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
 static inline void out_be16(volatile uint16_t *addr, uint16_t val)
@@ -144,8 +196,13 @@ static inline void out_be16(volatile uint16_t *addr, uint16_t val)
 
 static inline void __out_le16(volatile uint16_t *addr, uint16_t val)
 {
-	asm volatile("sthcix %0,0,%1"
-		     : : "r"(cpu_to_le16(val)), "r"(addr), "m"(*addr) : "memory");
+	val = cpu_to_le16(val);
+	if (vm_realmode())
+		asm volatile("sthcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("sthx %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
 static inline void out_le16(volatile uint16_t *addr, uint16_t val)
@@ -156,8 +213,13 @@ static inline void out_le16(volatile uint16_t *addr, uint16_t val)
 
 static inline void __out_be32(volatile uint32_t *addr, uint32_t val)
 {
-	asm volatile("stwcix %0,0,%1"
-		     : : "r"(cpu_to_be32(val)), "r"(addr), "m"(*addr) : "memory");
+	val = cpu_to_be32(val);
+	if (vm_realmode())
+		asm volatile("stwcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stwx %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
 static inline void out_be32(volatile uint32_t *addr, uint32_t val)
@@ -168,8 +230,13 @@ static inline void out_be32(volatile uint32_t *addr, uint32_t val)
 
 static inline void __out_le32(volatile uint32_t *addr, uint32_t val)
 {
-	asm volatile("stwcix %0,0,%1"
-		     : : "r"(cpu_to_le32(val)), "r"(addr), "m"(*addr) : "memory");
+	val = cpu_to_le32(val);
+	if (vm_realmode())
+		asm volatile("stwcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stwx %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
 static inline void out_le32(volatile uint32_t *addr, uint32_t val)
@@ -180,8 +247,13 @@ static inline void out_le32(volatile uint32_t *addr, uint32_t val)
 
 static inline void __out_be64(volatile uint64_t *addr, uint64_t val)
 {
-	asm volatile("stdcix %0,0,%1"
-		     : : "r"(cpu_to_be64(val)), "r"(addr), "m"(*addr) : "memory");
+	val = cpu_to_be64(val);
+	if (vm_realmode())
+		asm volatile("stdcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stdx %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
 static inline void out_be64(volatile uint64_t *addr, uint64_t val)
@@ -192,8 +264,13 @@ static inline void out_be64(volatile uint64_t *addr, uint64_t val)
 
 static inline void __out_le64(volatile uint64_t *addr, uint64_t val)
 {
-	asm volatile("stdcix %0,0,%1"
-		     : : "r"(cpu_to_le64(val)), "r"(addr), "m"(*addr) : "memory");
+	val = cpu_to_le64(val);
+	if (vm_realmode())
+		asm volatile("stdcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stdx %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
 static inline void out_le64(volatile uint64_t *addr, uint64_t val)
diff --git a/include/mem_region.h b/include/mem_region.h
index 65eda4222..df8d03f01 100644
--- a/include/mem_region.h
+++ b/include/mem_region.h
@@ -33,6 +33,7 @@ struct mem_region {
 	struct list_node list;
 	const char *name;
 	uint64_t start, len;
+	uint64_t vm_mapped_len;
 	struct dt_node *node;
 	enum mem_region_type type;
 	struct list_head free_list;
diff --git a/include/platform.h b/include/platform.h
index 6ecdbe474..a4673eebc 100644
--- a/include/platform.h
+++ b/include/platform.h
@@ -289,8 +289,8 @@ struct platform {
 	void (*vpd_iohub_load)(struct dt_node *hub_node);
 };
 
-extern struct platform __platforms_start;
-extern struct platform __platforms_end;
+extern struct platform __platforms_start[];
+extern struct platform __platforms_end[];
 
 extern struct platform	platform;
 extern const struct bmc_platform *bmc_platform;
diff --git a/include/processor.h b/include/processor.h
index a0c2864a8..faaa7ceb0 100644
--- a/include/processor.h
+++ b/include/processor.h
@@ -39,7 +39,9 @@
 #define SPR_SRR1	0x01b	/* RW: Exception save/restore reg 1 */
 #define SPR_CFAR	0x01c	/* RW: Come From Address Register */
 #define SPR_AMR		0x01d	/* RW: Authority Mask Register */
+#define SPR_PID		0x030	/* RW: PID register */
 #define SPR_IAMR	0x03d	/* RW: Instruction Authority Mask Register */
+#define SPR_UAMOR	0x09d	/* RW: User Authority Mask Override Register */
 #define SPR_RPR		0x0ba   /* RW: Relative Priority Register */
 #define SPR_TBRL	0x10c	/* RO: Timebase low */
 #define SPR_TBRU	0x10d	/* RO: Timebase high */
@@ -61,10 +63,12 @@
 #define SPR_HSRR1	0x13b	/* RW: HV Exception save/restore reg 1 */
 #define SPR_TFMR	0x13d
 #define SPR_LPCR	0x13e
+#define SPR_LPID	0x13f	/* RW: LPID register */
 #define SPR_HMER	0x150	/* Hypervisor Maintenance Exception */
 #define SPR_HMEER	0x151	/* HMER interrupt enable mask */
 #define SPR_PCR		0x152
 #define SPR_AMOR	0x15d
+#define SPR_PTCR	0x1d0	/* RW: Partition table control register */
 #define SPR_PSSCR	0x357   /* RW: Stop status and control (ISA 3) */
 #define SPR_TSCR	0x399
 #define SPR_HID0	0x3f0
@@ -80,6 +84,11 @@
 #define SPR_SRR1_PM_WAKE_SRESET	0x100000
 #define SPR_SRR1_PM_WAKE_MCE	0x3c0000	/* Use reserved value for MCE */
 
+/* Bits in DSISR */
+
+#define	DSISR_ISSTORE		0x02000000
+
+
 /* Bits in LPCR */
 
 /* Powersave Exit Cause Enable is different on each generation */
@@ -312,9 +321,9 @@ static inline void isync(void)
 /*
  * Cache sync
  */
-static inline void sync_icache(void)
+static inline void sync_icache(unsigned long ptr)
 {
-	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (0) : "memory");
+	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (ptr) : "memory");
 }
 
 /*
diff --git a/include/skiboot.h b/include/skiboot.h
index 6946b8056..b2f4ec3ab 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -42,10 +42,16 @@ extern char _stext[];
 extern char _etext[];
 extern char __sym_map_end[];
 extern char _romem_end[];
+extern char __vm_mapped_romem_end[];
 
 #ifndef __TESTING__
+extern char _stext[], _etext[];
 /* Readonly section start and end. */
 extern char __rodata_start[], __rodata_end[];
+extern char _sdata[], _edata[];
+extern char __sym_map_start[], __sym_map_end[];
+extern char _sbss[], _ebss[];
+extern char _end[];
 
 static inline bool is_rodata(const void *p)
 {
@@ -184,6 +190,7 @@ extern void disable_fast_reboot(const char *reason);
 extern void add_fast_reboot_dt_entries(void);
 extern void fast_reboot(void);
 extern void __noreturn __secondary_cpu_entry(void);
+extern void __noreturn __return_cpu_entry(void);
 extern void __noreturn load_and_boot_kernel(bool is_reboot);
 extern void cleanup_local_tlb(void);
 extern void cleanup_global_tlb(void);
@@ -336,4 +343,24 @@ extern int fake_nvram_info(uint32_t *total_size);
 extern int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len);
 extern int fake_nvram_write(uint32_t offset, void *src, uint32_t size);
 
+/* core/vm.c */
+bool vm_realmode(void);
+void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci);
+void vm_map_global_text(const char *name, unsigned long addr, unsigned long len);
+void vm_unmap_global(unsigned long addr, unsigned long len);
+void *vm_map(unsigned long addr, unsigned long len, bool rw);
+void vm_unmap(unsigned long addr, unsigned long len);
+void vm_init(bool fast_reboot);
+void vm_init_stacks(void);
+void vm_destroy(void);
+void vm_init_secondary(void);
+void vm_enter(void);
+void vm_exit(void);
+void vm_exit_cleanup(void);
+void vm_map_stacks(void);
+bool vm_dslb(uint64_t nia, uint64_t dar);
+bool vm_islb(uint64_t nia);
+bool vm_dsi(uint64_t nia, uint64_t dar, bool store);
+bool vm_isi(uint64_t nia);
+
 #endif /* __SKIBOOT_H */
diff --git a/libstb/container.c b/libstb/container.c
index 58fd18f9e..63c99406e 100644
--- a/libstb/container.c
+++ b/libstb/container.c
@@ -6,14 +6,20 @@
 
 bool stb_is_container(const void *buf, size_t size)
 {
+	uint32_t *t;
 	ROM_container_raw *c;
+	bool ret = true; /* cleared if the magic number does not match */
 
 	c = (ROM_container_raw*) buf;
 	if (!buf || size < SECURE_BOOT_HEADERS_SIZE)
 		return false;
-	if (be32_to_cpu(c->magic_number) != ROM_MAGIC_NUMBER )
-		return false;
-	return true;
+
+	t = vm_map((unsigned long)&c->magic_number, sizeof(*t), false);
+	if (be32_to_cpu(*t) != ROM_MAGIC_NUMBER)
+		ret = false;
+	vm_unmap((unsigned long)&c->magic_number, sizeof(*t));
+
+	return ret;
 }
 
 uint32_t stb_payload_magic(const void *buf, size_t size)
diff --git a/libstb/cvc.c b/libstb/cvc.c
index 356d36bd9..4dee64080 100644
--- a/libstb/cvc.c
+++ b/libstb/cvc.c
@@ -155,6 +155,9 @@ static int cvc_reserved_mem_init(struct dt_node *parent) {
 		return -1;
 	}
 	addr = dt_get_address(cvc_resv_mem, 0, &size);
+	if (size == 0) // MAMBO HACK
+		size = 64*1024;
+	vm_map_global_text("STB-CVC", addr, size);
 	cvc_register(addr, addr + size-1);
 
 	exports = dt_find_by_path(dt_root, "/ibm,opal/firmware/exports");
diff --git a/libstb/secureboot.c b/libstb/secureboot.c
index 022e2aa09..dda2fc5b2 100644
--- a/libstb/secureboot.c
+++ b/libstb/secureboot.c
@@ -164,6 +164,7 @@ int secureboot_verify(enum resource_id id, void *buf, size_t len)
 {
 	const char *name;
 	__be64 log;
+	void *vbuf;
 	int rc = -1;
 
 	name = flash_map_resource_name(id);
@@ -181,7 +182,9 @@ int secureboot_verify(enum resource_id id, void *buf, size_t len)
 		return -1;
         }
 
-	rc = call_cvc_verify(buf, len, hw_key_hash, hw_key_hash_size, &log);
+	vbuf = vm_map((unsigned long)buf, len, false);
+	rc = call_cvc_verify(vbuf, len, hw_key_hash, hw_key_hash_size, &log);
+	vm_unmap((unsigned long)buf, len);
 
 	if (rc == OPAL_SUCCESS) {
 		prlog(PR_NOTICE, "%s verified\n", name);
diff --git a/libstb/trustedboot.c b/libstb/trustedboot.c
index 3f977de10..cd5b5207d 100644
--- a/libstb/trustedboot.c
+++ b/libstb/trustedboot.c
@@ -161,7 +161,7 @@ out_free:
 int trustedboot_measure(enum resource_id id, void *buf, size_t len)
 {
 	uint8_t digest[SHA512_DIGEST_LENGTH];
-	void *buf_aux;
+	void *buf_aux, *vbuf;
 	size_t len_aux;
 	const char *name;
 	TPM_Pcr pcr;
@@ -219,7 +219,9 @@ int trustedboot_measure(enum resource_id id, void *buf, size_t len)
 		len_aux = len;
 	}
 
-	rc = call_cvc_sha512(buf_aux, len_aux, digest, SHA512_DIGEST_LENGTH);
+	vbuf = vm_map((unsigned long)buf_aux, len_aux, false);
+	rc = call_cvc_sha512(vbuf, len_aux, digest, SHA512_DIGEST_LENGTH);
+	vm_unmap((unsigned long)buf_aux, len_aux);
 
 	if (rc == OPAL_SUCCESS) {
 		prlog(PR_NOTICE, "%s hash calculated\n", name);
diff --git a/skiboot.lds.S b/skiboot.lds.S
index 0fc0e7c8f..577b25185 100644
--- a/skiboot.lds.S
+++ b/skiboot.lds.S
@@ -123,12 +123,26 @@ SECTIONS
 		__rodata_end = .;
 	}
 
+	. = ALIGN(0x100);
+	.got : {
+		__toc_start = . + 0x8000;
+		*(.got)
+		*(.toc)
+	}
+
+	. = ALIGN(0x10);
+	.opd : {
+		*(.opd)
+	}
+
 	. = ALIGN(0x10);
 	.trap_table : {
 		__trap_table_start = .;
 		KEEP(*(.trap_table))
 		__trap_table_end = .;
 	}
+	__vm_mapped_romem_end = .;
+	. = ALIGN(PAGE_SIZE);
 
 	. = ALIGN(0x10);
 	.init : {
@@ -139,18 +153,6 @@ SECTIONS
 		__ctors_end = .;
 	}
 
-	. = ALIGN(0x10);
-	.opd : {
-		*(.opd)
-	}
-  
-	. = ALIGN(0x100);
-	.got : {
-		__toc_start = . + 0x8000;
-		*(.got)
-		*(.toc)
-	}
-
 	. = ALIGN(0x10);
 	.opal_table : {
 		__opal_table_start = .;
-- 
2.23.0



More information about the Skiboot mailing list