[Skiboot] [RFC PATCH 2/3] virtual memory for OPAL boot

Nicholas Piggin npiggin at gmail.com
Wed Jun 5 12:36:15 AEST 2019


vm_map / vm_unmap set up and tear down a per-cpu mapping, which cannot
nest. vm_map returns an EA that differs from the PA of the memory when
in vmm mode.

vm_map_global / vm_unmap_global set up and tear down globally visible
1:1 mappings.

A list of global extents + a local extent per cpu is kept to describe
active mappings. Fault handlers look these up to install SLB/HPTE entries.

The code should move toward keeping fewer long-lived global mappings:
things that are kept around should be unmapped when finished with or,
better yet, converted to local mappings.
---
 core/Makefile.inc    |   2 +-
 core/cpu.c           |  19 +-
 core/exceptions.c    |  40 ++-
 core/fast-reboot.c   |  30 +-
 core/flash.c         |   4 +-
 core/init.c          | 169 +++++++--
 core/mem_region.c    |  76 ++--
 core/opal.c          |  20 +-
 core/vm.c            | 812 +++++++++++++++++++++++++++++++++++++++++++
 hdata/spira.c        |  21 +-
 hw/fake-nvram.c      |  12 +-
 hw/homer.c           |   5 +
 hw/lpc-uart.c        |  31 +-
 hw/lpc.c             |   2 +
 hw/phb4.c            |   9 +-
 hw/psi.c             |   2 +
 hw/slw.c             |   4 +-
 hw/xive.c            |   5 +
 hw/xscom.c           |   4 +-
 include/cmpxchg.h    |   3 +
 include/cpu.h        |  22 ++
 include/elf-abi.h    |  20 +-
 include/io.h         |  57 ++-
 include/mem_region.h |   1 +
 include/processor.h  |  13 +-
 include/skiboot.h    |  27 ++
 libstb/container.c   |  12 +-
 skiboot.lds.S        |  56 +--
 28 files changed, 1354 insertions(+), 124 deletions(-)
 create mode 100644 core/vm.c

diff --git a/core/Makefile.inc b/core/Makefile.inc
index 21c12fb8d..cdc4adb8e 100644
--- a/core/Makefile.inc
+++ b/core/Makefile.inc
@@ -1,7 +1,7 @@
 # -*-Makefile-*-
 
 SUBDIRS += core
-CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
+CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o vm.o
 CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
 CORE_OBJS += opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
 CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
diff --git a/core/cpu.c b/core/cpu.c
index 54111a954..08bc78d7f 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -389,6 +389,7 @@ static unsigned int cpu_idle_p8(enum cpu_wake_cause wake_on)
 	uint64_t lpcr = mfspr(SPR_LPCR) & ~SPR_LPCR_P8_PECE;
 	struct cpu_thread *cpu = this_cpu();
 	unsigned int vec = 0;
+	bool vm_setup = cpu->vm_setup;
 
 	if (!pm_enabled) {
 		prlog_once(PR_DEBUG, "cpu_idle_p8 called pm disabled\n");
@@ -429,8 +430,13 @@ static unsigned int cpu_idle_p8(enum cpu_wake_cause wake_on)
 	}
 	isync();
 
+	if (vm_setup)
+		vm_exit();
 	/* Enter nap */
 	vec = enter_p8_pm_state(false);
+	mtmsrd(MSR_RI, 1);
+	if (vm_setup)
+		vm_enter();
 
 skip_sleep:
 	/* Restore */
@@ -485,15 +491,24 @@ static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
 	isync();
 
 	if (sreset_enabled) {
+		bool vm_setup = cpu->vm_setup;
+
 		/* stop with EC=1 (sreset) and ESL=1 (enable thread switch). */
 		/* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BIT(42) | PPC_BIT(43) |
 			PPC_BITMASK(54, 55) | PPC_BIT(63);
+		if (vm_setup)
+			vm_exit();
 		vec = enter_p9_pm_state(psscr);
+		/* XXX don't enable VM if 0x100 or 0x200 */
+		mtmsrd(MSR_RI, 1);
+		if (vm_setup)
+			vm_enter();
 	} else {
 		/* stop with EC=0 (resumes) which does not require sreset. */
 		/* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
+		/* Can run with VM enabled */
 		enter_p9_pm_lite_state(psscr);
 	}
 
@@ -536,12 +551,10 @@ static void cpu_idle_pm(enum cpu_wake_cause wake_on)
 		default:
 			break;
 		}
-		mtmsrd(MSR_RI, 1);
 
 	} else if (vec == 0x200) {
 		exception_entry_pm_mce();
 		enable_machine_check();
-		mtmsrd(MSR_RI, 1);
 	}
 }
 
@@ -1374,7 +1387,7 @@ static int64_t opal_return_cpu(void)
 		printf("OPAL in_opal_call=%u\n", this_cpu()->in_opal_call);
 	}
 
-	__secondary_cpu_entry();
+	__return_cpu_entry();
 
 	return OPAL_HARDWARE; /* Should not happen */
 }
diff --git a/core/exceptions.c b/core/exceptions.c
index 5e453264e..89b4451ab 100644
--- a/core/exceptions.c
+++ b/core/exceptions.c
@@ -98,6 +98,41 @@ void exception_entry(struct stack_frame *stack)
 			"Fatal MCE at "REG"   ", nip);
 		break;
 
+	case 0x300:
+		if (vm_dsi(nip, stack->dar, !!(stack->dsisr & DSISR_ISSTORE)))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal %s address "REG" at "REG"   ",
+			(stack->dsisr & DSISR_ISSTORE) ? "store" : "load",
+			stack->dar, nip);
+		break;
+
+	case 0x380:
+		if (vm_dslb(nip, stack->dar))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal load/store address "REG" at "REG"   ",
+			stack->dar, nip);
+		break;
+
+	case 0x400:
+		if (vm_isi(nip))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal ifetch at "REG"   ", nip);
+		break;
+
+	case 0x480:
+		if (vm_islb(nip))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal ifetch at "REG"   ", nip);
+		break;
+
 	default:
 		fatal = true;
 		prerror("***********************************************\n");
@@ -110,10 +145,11 @@ void exception_entry(struct stack_frame *stack)
 	prerror("%s\n", buf);
 	dump_regs(stack);
 
+	if (!fatal)
+		backtrace();
+out:
 	if (fatal)
 		abort();
-	else
-		backtrace();
 
 	if (hv) {
 		/* Set up for SRR return */
diff --git a/core/fast-reboot.c b/core/fast-reboot.c
index 07f83a30f..c10b78d33 100644
--- a/core/fast-reboot.c
+++ b/core/fast-reboot.c
@@ -355,6 +355,9 @@ void __noreturn fast_reboot_entry(void)
 	 * up and go processing jobs.
 	 */
 	if (this_cpu() != boot_cpu) {
+		cleanup_cpu_state();
+
+		sync();
 		if (!fast_boot_release) {
 			smt_lowest();
 			while (!fast_boot_release)
@@ -362,9 +365,6 @@ void __noreturn fast_reboot_entry(void)
 			smt_medium();
 		}
 		sync();
-		cleanup_cpu_state();
-		enable_machine_check();
-		mtmsrd(MSR_RI, 1);
 
 		__secondary_cpu_entry();
 	}
@@ -379,15 +379,22 @@ void __noreturn fast_reboot_entry(void)
 	if (proc_gen == proc_gen_p9)
 		xive_reset();
 
+	/* Cleanup ourselves */
+	cleanup_cpu_state();
+
+	/* XXX: need this? */
+	enable_machine_check();
+	mtmsrd(MSR_RI, 1);
+
+	/* Enter virtual memory mode */
+	vm_init();
+
 	prlog(PR_INFO, "RESET: Releasing secondaries...\n");
 
 	/* Release everybody */
 	sync();
 	fast_boot_release = true;
 
-	/* Cleanup ourselves */
-	cleanup_cpu_state();
-
 	/* Set our state to active */
 	sync();
 	this_cpu()->state = cpu_state_active;
@@ -414,6 +421,7 @@ void __noreturn fast_reboot_entry(void)
 	cpu_set_ipi_enable(true);
 
 	if (!chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+		void *t;
 		/*
 		 * mem_region_clear_unused avoids these preload regions
 		 * so it can run along side image preloading. Clear these
@@ -423,8 +431,14 @@ void __noreturn fast_reboot_entry(void)
 		 * Mambo may have embedded payload here, so don't clear
 		 * it at all.
 		 */
-		memset(KERNEL_LOAD_BASE, 0, KERNEL_LOAD_SIZE);
-		memset(INITRAMFS_LOAD_BASE, 0, INITRAMFS_LOAD_SIZE);
+
+		t = vm_map((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true);
+		memset(t, 0, KERNEL_LOAD_SIZE);
+		vm_unmap((unsigned long)t, KERNEL_LOAD_SIZE);
+
+		t = vm_map((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true);
+		memset(t, 0, INITRAMFS_LOAD_SIZE);
+		vm_unmap((unsigned long)t, INITRAMFS_LOAD_SIZE);
 	}
 
 	/* Start preloading kernel and ramdisk */
diff --git a/core/flash.c b/core/flash.c
index 3da6d4a42..420ae3244 100644
--- a/core/flash.c
+++ b/core/flash.c
@@ -762,9 +762,11 @@ done_reading:
 	 * Verify and measure the retrieved PNOR partition as part of the
 	 * secure boot and trusted boot requirements
 	 */
+#if 0
+// XXX: this checkstops
 	secureboot_verify(id, buf, *len);
 	trustedboot_measure(id, buf, *len);
-
+#endif
 	/* Find subpartition */
 	if (subid != RESOURCE_SUBID_NONE) {
 		memmove(buf, bufp, content_size);
diff --git a/core/init.c b/core/init.c
index 3db9df314..0fad02f67 100644
--- a/core/init.c
+++ b/core/init.c
@@ -91,6 +91,7 @@ static bool try_load_elf64_le(struct elf_hdr *header)
 	uint64_t load_base = (uint64_t)kh;
 	struct elf64_phdr *ph;
 	unsigned int i;
+	bool ret = false;
 
 	printf("INIT: 64-bit LE kernel discovered\n");
 
@@ -102,6 +103,9 @@ static bool try_load_elf64_le(struct elf_hdr *header)
 	 * but it will not work for any ELF binary.
 	 */
 	ph = (struct elf64_phdr *)(load_base + le64_to_cpu(kh->e_phoff));
+	vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
+			le16_to_cpu(kh->e_phnum)*sizeof(struct elf64_phdr),
+			false, false);
 	for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) {
 		if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
 			continue;
@@ -118,7 +122,7 @@ static bool try_load_elf64_le(struct elf_hdr *header)
 
 	if (!kernel_entry) {
 		prerror("INIT: Failed to find kernel entry !\n");
-		return false;
+		goto out_unmap;
 	}
 	kernel_entry += load_base;
 	kernel_32bit = false;
@@ -130,7 +134,12 @@ static bool try_load_elf64_le(struct elf_hdr *header)
 	prlog(PR_DEBUG, "INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
 	      kernel_entry, kernel_size);
 
-	return true;
+	ret = true;
+
+out_unmap:
+	vm_unmap_global((unsigned long)ph, le16_to_cpu(kh->e_phnum)*sizeof(struct elf64_phdr));
+
+	return ret;
 }
 
 static bool try_load_elf64(struct elf_hdr *header)
@@ -140,12 +149,17 @@ static bool try_load_elf64(struct elf_hdr *header)
 	struct elf64_phdr *ph;
 	struct elf64_shdr *sh;
 	unsigned int i;
+	bool ret = false;
+
+	vm_map_global("KERNEL ELF64 Header", (unsigned long)header,
+			sizeof(struct elf64_hdr), false, false);
 
 	/* Check it's a ppc64 LE ELF */
 	if (kh->ei_ident == ELF_IDENT		&&
 	    kh->ei_data == ELF_DATA_LSB		&&
 	    kh->e_machine == le16_to_cpu(ELF_MACH_PPC64)) {
-		return try_load_elf64_le(header);
+		ret = try_load_elf64_le(header);
+		goto out_unmap1;
 	}
 
 	/* Check it's a ppc64 ELF */
@@ -153,7 +167,7 @@ static bool try_load_elf64(struct elf_hdr *header)
 	    kh->ei_data != ELF_DATA_MSB		||
 	    kh->e_machine != ELF_MACH_PPC64) {
 		prerror("INIT: Kernel doesn't look like an ppc64 ELF\n");
-		return false;
+		goto out_unmap1;
 	}
 
 	/* Look for a loadable program header that has our entry in it
@@ -164,6 +178,8 @@ static bool try_load_elf64(struct elf_hdr *header)
 	 * but it will not work for any ELF binary.
 	 */
 	ph = (struct elf64_phdr *)(load_base + kh->e_phoff);
+	vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
+			kh->e_phnum*sizeof(struct elf64_phdr), false, false);
 	for (i = 0; i < kh->e_phnum; i++, ph++) {
 		if (ph->p_type != ELF_PTYPE_LOAD)
 			continue;
@@ -178,7 +194,7 @@ static bool try_load_elf64(struct elf_hdr *header)
 
 	if (!kernel_entry) {
 		prerror("INIT: Failed to find kernel entry !\n");
-		return false;
+		goto out_unmap2;
 	}
 
 	/* For the normal big-endian ELF ABI, the kernel entry points
@@ -188,6 +204,8 @@ static bool try_load_elf64(struct elf_hdr *header)
 	 * to assuming it obeys the ABI.
 	 */
 	sh = (struct elf64_shdr *)(load_base + kh->e_shoff);
+	vm_map_global("KERNEL ELF Section Headers", (unsigned long)sh,
+			kh->e_shnum*sizeof(struct elf64_shdr), false, false);
 	for (i = 0; i < kh->e_shnum; i++, sh++) {
 		if (sh->sh_addr <= kh->e_entry &&
 		      (sh->sh_addr + sh->sh_size) > kh->e_entry)
@@ -208,7 +226,15 @@ static bool try_load_elf64(struct elf_hdr *header)
 	printf("INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
 	       kernel_entry, kernel_size);
 
-	return true;
+	ret = true;
+
+	vm_unmap_global((unsigned long)sh, kh->e_shnum*sizeof(struct elf64_shdr));
+out_unmap2:
+	vm_unmap_global((unsigned long)ph, kh->e_phnum*sizeof(struct elf64_phdr));
+out_unmap1:
+	vm_unmap_global((unsigned long)header, sizeof(struct elf64_hdr));
+
+	return ret;
 }
 
 static bool try_load_elf32_le(struct elf_hdr *header)
@@ -321,6 +347,7 @@ bool start_preload_kernel(void)
 	int loaded;
 
 	/* Try to load an external kernel payload through the platform hooks */
+	vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true, false);
 	kernel_size = KERNEL_LOAD_SIZE;
 	loaded = start_preload_resource(RESOURCE_ID_KERNEL,
 					RESOURCE_SUBID_NONE,
@@ -329,9 +356,11 @@ bool start_preload_kernel(void)
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform start load kernel failed\n");
 		kernel_size = 0;
+		vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
 		return false;
 	}
 
+	vm_map_global("INITRAMFS", (unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true, false);
 	initramfs_size = INITRAMFS_LOAD_SIZE;
 	loaded = start_preload_resource(RESOURCE_ID_INITRAMFS,
 					RESOURCE_SUBID_NONE,
@@ -339,6 +368,7 @@ bool start_preload_kernel(void)
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform start load initramfs failed\n");
 		initramfs_size = 0;
+		vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
 		return false;
 	}
 
@@ -348,13 +378,16 @@ bool start_preload_kernel(void)
 static bool load_kernel(void)
 {
 	void *stb_container = NULL;
-	struct elf_hdr *kh;
+	struct elf_hdr *kh, *t;
+	uint32_t ei_ident;
+	uint8_t ei_class;
 	int loaded;
 
 	prlog(PR_NOTICE, "INIT: Waiting for kernel...\n");
 
 	loaded = wait_for_resource_loaded(RESOURCE_ID_KERNEL,
 					  RESOURCE_SUBID_NONE);
+	vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
 
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform wait for kernel load failed\n");
@@ -370,8 +403,10 @@ static bool load_kernel(void)
 				((uint64_t)__builtin_kernel_start) -
 				SKIBOOT_BASE + boot_offset;
 			printf("Using built-in kernel\n");
+			vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, kernel_size, true, false);
 			memmove(KERNEL_LOAD_BASE, (void*)builtin_base,
 				kernel_size);
+			vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, kernel_size);
 		}
 	}
 
@@ -387,7 +422,7 @@ static bool load_kernel(void)
 		if (kernel_entry < EXCEPTION_VECTORS_END) {
 			cpu_set_sreset_enable(false);
 			memcpy_null(NULL, old_vectors, EXCEPTION_VECTORS_END);
-			sync_icache();
+			sync_icache(0);
 		} else {
 			/* Hack for STB in Mambo, assume at least 4kb in mem */
 			if (!kernel_size)
@@ -418,15 +453,20 @@ static bool load_kernel(void)
 	      "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
 	      kernel_size);
 
-	if (kh->ei_ident != ELF_IDENT) {
+	t = vm_map((unsigned long)kh, sizeof(*kh), false);
+	ei_ident = t->ei_ident;
+	ei_class = t->ei_class;
+	vm_unmap((unsigned long)t, sizeof(*kh));
+
+	if (ei_ident != ELF_IDENT) {
 		prerror("INIT: ELF header not found. Assuming raw binary.\n");
 		return true;
 	}
 
-	if (kh->ei_class == ELF_CLASS_64) {
+	if (ei_class == ELF_CLASS_64) {
 		if (!try_load_elf64(kh))
 			return false;
-	} else if (kh->ei_class == ELF_CLASS_32) {
+	} else if (ei_class == ELF_CLASS_32) {
 		if (!try_load_elf32(kh))
 			return false;
 	} else {
@@ -454,7 +494,7 @@ static void load_initramfs(void)
 
 	loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
 					  RESOURCE_SUBID_NONE);
-
+	vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
 	if (loaded != OPAL_SUCCESS || !initramfs_size)
 		return;
 
@@ -526,6 +566,7 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
 	const struct dt_property *memprop;
 	const char *cmdline, *stdoutp;
 	uint64_t mem_top;
+	uint32_t *t;
 
 	memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
 	if (memprop)
@@ -619,11 +660,13 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
 
 	fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
 
+	t = vm_map(kernel_entry, 4, false);
 	/* Check there is something there before we branch to it */
-	if (*(uint32_t *)kernel_entry == 0) {
+	if (*t == 0) {
 		prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n");
 		assert(0);
 	}
+	vm_unmap(kernel_entry, 4);
 
 	/* Take processors out of nap */
 	cpu_set_sreset_enable(false);
@@ -632,6 +675,9 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
 	printf("INIT: Starting kernel at 0x%llx, fdt at %p %u bytes\n",
 	       kernel_entry, fdt, fdt_totalsize(fdt));
 
+	/* Go back to realmode and tear down our VM before booting kernel */
+	vm_destroy();
+
 	/* Disable machine checks on all */
 	cpu_disable_ME_RI_all();
 
@@ -798,34 +844,55 @@ static void setup_branch_null_catcher(void)
 
 void copy_sreset_vector(void)
 {
+	static char patch[0x100];
 	uint32_t *src, *dst;
+	uint32_t *t;
+	uint32_t len = (void *)&reset_patch_end - (void *)&reset_patch_start;
 
 	/* Copy the reset code over the entry point. */
 	src = &reset_patch_start;
+	t = vm_map((unsigned long)src, len, false);
+	memcpy(patch, t, len);
+	vm_unmap((unsigned long)src, len);
+
 	dst = (uint32_t *)0x100;
-	while(src < &reset_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
+	t = vm_map((unsigned long)dst, len, true);
+	memcpy(t, patch, len);
+	sync_icache((unsigned long)t);
+	vm_unmap((unsigned long)dst, len);
 }
 
 void copy_sreset_vector_fast_reboot(void)
 {
+	static char patch[0x100];
 	uint32_t *src, *dst;
+	uint32_t *t;
+	uint32_t len = (void *)&reset_fast_reboot_patch_end -
+			(void *)&reset_fast_reboot_patch_start;
 
 	/* Copy the reset code over the entry point. */
 	src = &reset_fast_reboot_patch_start;
+	t = vm_map((unsigned long)src, len, false);
+	memcpy(patch, t, len);
+	vm_unmap((unsigned long)src, len);
+
 	dst = (uint32_t *)0x100;
-	while(src < &reset_fast_reboot_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
+	t = vm_map((unsigned long)dst, len, true);
+	memcpy(t, patch, len);
+	sync_icache((unsigned long)t);
+	vm_unmap((unsigned long)dst, len);
 }
 
 void copy_exception_vectors(void)
 {
+	void *t;
+
+	t = vm_map(0x0, 0x2000, true);
+
 	/* Backup previous vectors as this could contain a kernel
 	 * image.
 	 */
-	memcpy_null(old_vectors, NULL, EXCEPTION_VECTORS_END);
+	memcpy(old_vectors, t, EXCEPTION_VECTORS_END);
 
 	/* Copy from 0x100 to EXCEPTION_VECTORS_END, avoid below 0x100 as
 	 * this is the boot flag used by CPUs still potentially entering
@@ -833,9 +900,10 @@ void copy_exception_vectors(void)
 	 */
 	BUILD_ASSERT((&reset_patch_end - &reset_patch_start) <
 			EXCEPTION_VECTORS_END - 0x100);
-	memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100),
+	memcpy(t + 0x100, (void *)(SKIBOOT_BASE + 0x100),
 			EXCEPTION_VECTORS_END - 0x100);
-	sync_icache();
+	sync_icache((unsigned long)t);
+	vm_unmap(0x0, 0x2000);
 }
 
 static void per_thread_sanity_checks(void)
@@ -899,16 +967,25 @@ static uint32_t romem_csum;
 
 static void checksum_romem(void)
 {
+	void *t;
+	unsigned long size;
 	uint32_t csum;
 
 	romem_csum = 0;
 	if (chip_quirk(QUIRK_SLOW_SIM))
 		return;
 
-	csum = mem_csum(_start, _romem_end);
+	size = (unsigned long)_romem_end - (unsigned long)_start;
+	t = vm_map((unsigned long)_start, size, false);
+	csum = mem_csum(t, t + size);
 	romem_csum ^= csum;
-	csum = mem_csum(__builtin_kernel_start, __builtin_kernel_end);
+	vm_unmap((unsigned long)_start, size);
+
+	size = (unsigned long)__builtin_kernel_end - (unsigned long)__builtin_kernel_start;
+	t = vm_map((unsigned long)__builtin_kernel_start, size, false);
+	csum = mem_csum(t, t + size);
 	romem_csum ^= csum;
+	vm_unmap((unsigned long)__builtin_kernel_start, size);
 }
 
 bool verify_romem(void)
@@ -984,7 +1061,7 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	prlog(PR_DEBUG, "initial console log level: memory %d, driver %d\n",
 	       (debug_descriptor.console_log_levels >> 4),
 	       (debug_descriptor.console_log_levels & 0x0f));
-	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology.\n");
+	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology. Now with more indirection.\n");
 
 #ifdef SKIBOOT_GCOV
 	skiboot_gcov_done();
@@ -996,6 +1073,9 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	/* Now locks can be used */
 	init_locks();
 
+	/* Enter virtual memory mode */
+	vm_init();
+
 	/* Create the OPAL call table early on, entries can be overridden
 	 * later on (FSP console code for example)
 	 */
@@ -1021,7 +1101,20 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
 		if (parse_hdat(false) < 0)
 			abort();
 	} else {
+		void *t;
+		uint32_t size;
+
+		t = vm_map((unsigned long)fdt, sizeof(struct fdt_header), false);
+		size = fdt_totalsize(t);
+		vm_unmap((unsigned long)fdt, sizeof(struct fdt_header));
+
+		/*
+		 * Would be nice to make this a local map, but it seems
+		 * to need to be expanded in place.
+		 */
+		vm_map_global("fdt", (unsigned long)fdt, size, false, false);
 		dt_expand(fdt);
+		vm_unmap_global((unsigned long)fdt, size);
 	}
 	dt_add_cpufeatures(dt_root);
 
@@ -1072,6 +1165,8 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	 */
 	init_cpu_max_pir();
 
+	vm_init_stacks();
+
 	/*
 	 * Now, we init our memory map from the device-tree, and immediately
 	 * reserve areas which we know might contain data coming from
@@ -1308,6 +1403,30 @@ void __noreturn __secondary_cpu_entry(void)
 	enable_machine_check();
 	mtmsrd(MSR_RI, 1);
 
+	vm_init_secondary();
+
+	/* Some XIVE setup */
+	xive_cpu_callin(cpu);
+
+	/* Wait for work to do */
+	while(true) {
+		if (cpu_check_jobs(cpu))
+			cpu_process_jobs();
+		else
+			cpu_idle_job();
+	}
+}
+
+void __noreturn __return_cpu_entry(void)
+{
+	struct cpu_thread *cpu = this_cpu();
+
+	/* Secondary CPU called in */
+	cpu_callin(cpu);
+
+	enable_machine_check();
+	mtmsrd(MSR_RI, 1);
+
 	/* Some XIVE setup */
 	xive_cpu_callin(cpu);
 
diff --git a/core/mem_region.c b/core/mem_region.c
index 74551922b..fe89cedb6 100644
--- a/core/mem_region.c
+++ b/core/mem_region.c
@@ -66,24 +66,27 @@ static struct mem_region skiboot_os_reserve = {
 	.type		= REGION_OS,
 };
 
-struct mem_region skiboot_heap = {
-	.name		= "ibm,firmware-heap",
-	.start		= HEAP_BASE,
-	.len		= HEAP_SIZE,
-	.type		= REGION_SKIBOOT_HEAP,
-};
-
 static struct mem_region skiboot_code_and_text = {
 	.name		= "ibm,firmware-code",
 	.start		= SKIBOOT_BASE,
 	.len		= HEAP_BASE - SKIBOOT_BASE,
+	.vm_mapped_len	= HEAP_BASE - SKIBOOT_BASE,
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
+struct mem_region skiboot_heap = {
+	.name		= "ibm,firmware-heap",
+	.start		= HEAP_BASE,
+	.len		= HEAP_SIZE,
+	.vm_mapped_len	= HEAP_SIZE,
+	.type		= REGION_SKIBOOT_HEAP,
+};
+
 static struct mem_region skiboot_after_heap = {
 	.name		= "ibm,firmware-data",
 	.start		= HEAP_BASE + HEAP_SIZE,
 	.len		= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
+	.vm_mapped_len	= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
@@ -153,14 +156,6 @@ static struct alloc_hdr *next_hdr(const struct mem_region *region,
 #if POISON_MEM_REGION == 1
 static void mem_poison(struct free_hdr *f)
 {
-	size_t poison_size = (void*)tailer(f) - (void*)(f+1);
-
-	/* We only poison up to a limit, as otherwise boot is
-	 * kinda slow */
-	if (poison_size > POISON_MEM_REGION_LIMIT)
-		poison_size = POISON_MEM_REGION_LIMIT;
-
-	memset(f+1, POISON_MEM_REGION_WITH, poison_size);
 }
 #endif
 
@@ -168,23 +163,42 @@ static void mem_poison(struct free_hdr *f)
 static void init_allocatable_region(struct mem_region *region)
 {
 	struct free_hdr *f = region_start(region);
+	unsigned long num_longs;
+	unsigned long *t;
+
 	assert(region->type == REGION_SKIBOOT_HEAP ||
 	       region->type == REGION_MEMORY);
-	f->hdr.num_longs = region->len / sizeof(long);
+
+	num_longs = region->len / sizeof(long);
+
+	if (!region->vm_mapped_len) {
+		/* SKIBOOT_BASE-SIZE regions already come mapped */
+		region->vm_mapped_len = PAGE_SIZE;
+		vm_map_global(region->name, region->start, PAGE_SIZE, true, false);
+	}
+
+	assert(PAGE_SIZE >= sizeof(*f));
+	assert(region->len >= PAGE_SIZE*2);
+
+	f->hdr.num_longs = num_longs;
 	f->hdr.free = true;
 	f->hdr.prev_free = false;
-	*tailer(f) = f->hdr.num_longs;
 	list_head_init(&region->free_list);
 	list_add(&region->free_list, &f->list);
-#if POISON_MEM_REGION == 1
+#if 0 && POISON_MEM_REGION == 1
 	mem_poison(f);
 #endif
+
+	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
+	*t = num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
 }
 
 static void make_free(struct mem_region *region, struct free_hdr *f,
 		      const char *location, bool skip_poison)
 {
 	struct alloc_hdr *next;
+	unsigned long *t;
 
 #if POISON_MEM_REGION == 1
 	if (!skip_poison)
@@ -212,7 +226,9 @@ static void make_free(struct mem_region *region, struct free_hdr *f,
 	}
 
 	/* Fix up tailer. */
-	*tailer(f) = f->hdr.num_longs;
+	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
+	*t = f->hdr.num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
 
 	/* If next is free, coalesce it */
 	next = next_hdr(region, &f->hdr);
@@ -401,6 +417,7 @@ static void *__mem_alloc(struct mem_region *region, size_t size, size_t align,
 	size_t alloc_longs, offset;
 	struct free_hdr *f;
 	struct alloc_hdr *next;
+	unsigned long newsz;
 
 	/* Align must be power of 2. */
 	assert(!((align - 1) & align));
@@ -456,6 +473,17 @@ found:
 		next->prev_free = false;
 	}
 
+	newsz = ((void *)((unsigned long *)f + alloc_longs + offset) - region_start(region) + sizeof(struct free_hdr));
+	if (newsz > region->vm_mapped_len) {
+		/* TODO: unmap on free */
+		newsz += PAGE_SIZE-1;
+		newsz &= ~(PAGE_SIZE-1);
+		vm_map_global(location,
+			region->start + region->vm_mapped_len,
+			newsz - region->vm_mapped_len, true, false);
+		region->vm_mapped_len = newsz;
+	}
+
 	if (offset != 0) {
 		struct free_hdr *pre = f;
 
@@ -700,6 +728,7 @@ static struct mem_region *new_region(const char *name,
 	region->name = name;
 	region->start = start;
 	region->len = len;
+	region->vm_mapped_len = 0;
 	region->node = node;
 	region->type = type;
 	region->free_list.n.next = NULL;
@@ -1232,6 +1261,7 @@ void mem_region_release_unused(void)
 static void mem_clear_range(uint64_t s, uint64_t e)
 {
 	uint64_t res_start, res_end;
+	void *t;
 
 	/* Skip exception vectors */
 	if (s < EXCEPTION_VECTORS_END)
@@ -1271,7 +1301,10 @@ static void mem_clear_range(uint64_t s, uint64_t e)
 
 	prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
 	      (long long)s, (long long)e);
-	memset((void *)s, 0, e - s);
+
+	t = vm_map(s, e - s, true);
+	memset(t, 0, e - s);
+	vm_unmap(s, e - s);
 }
 
 struct mem_region_clear_job_args {
@@ -1285,7 +1318,8 @@ static void mem_region_clear_job(void *data)
 	mem_clear_range(arg->s, arg->e);
 }
 
-#define MEM_REGION_CLEAR_JOB_SIZE (16ULL*(1<<30))
+/* Limited by 256MB segment size (could fix) */
+#define MEM_REGION_CLEAR_JOB_SIZE (128ULL*(1<<20))
 
 static struct cpu_job **mem_clear_jobs;
 static struct mem_region_clear_job_args *mem_clear_job_args;
diff --git a/core/opal.c b/core/opal.c
index 3a2fbb95b..df6e70a3c 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -68,7 +68,16 @@ void opal_table_init(void)
 	prlog(PR_DEBUG, "OPAL table: %p .. %p, branch table: %p\n",
 	      s, e, opal_branch_table);
 	while(s < e) {
-		opal_branch_table[s->token] = function_entry_address(s->func);
+		uint64_t f;
+		uint64_t *t;
+
+		f = function_entry_address(s->func);
+
+		t = vm_map((unsigned long)&opal_branch_table[s->token], sizeof(*t), true);
+
+		*t = f;
+		vm_unmap((unsigned long)&opal_branch_table[s->token], sizeof(*t));
+
 		opal_num_args[s->token] = s->nargs;
 		s++;
 	}
@@ -331,9 +340,16 @@ opal_call(OPAL_QUIESCE, opal_quiesce, 2);
 
 void __opal_register(uint64_t token, void *func, unsigned int nargs)
 {
+	uint64_t f;
+	uint64_t *t;
+
 	assert(token <= OPAL_LAST);
 
-	opal_branch_table[token] = function_entry_address(func);
+	f = function_entry_address(func);
+
+	t = vm_map((unsigned long)&opal_branch_table[token], sizeof(uint64_t), true);
+	*t = f;
+	vm_unmap((unsigned long)&opal_branch_table[token], sizeof(uint64_t));
 	opal_num_args[token] = nargs;
 }
 
diff --git a/core/vm.c b/core/vm.c
new file mode 100644
index 000000000..1bf5e4bd8
--- /dev/null
+++ b/core/vm.c
@@ -0,0 +1,812 @@
+/* Copyright 2018 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ccan/container_of/container_of.h>
+#include <ccan/list/list.h>
+#include <ccan/str/str.h>
+#include <cmpxchg.h>
+#include <cpu.h>
+#include <opal.h>
+#include <skiboot.h>
+#include <stack.h>
+#include <timebase.h>
+#include <trace.h>
+
+/* Set at the end of vm_init(), cleared by vm_destroy(). */
+static bool vm_setup = false;
+/* Set once vm_init() has registered the initial global mappings; a later vm_init() skips them. */
+static bool vm_globals_allocated = false;
+
+#define SLB_SZ		(256UL*1024*1024)
+#define SLB_NR		32
+#define LOCAL_SLB_NR	2
+/* Slots [0, GLOBAL_SLB_NR) are recycled round-robin by slb_add() */
+#define GLOBAL_SLB_NR	(SLB_NR - LOCAL_SLB_NR)
+/* Slot used by vm_init_cpu() for the per-cpu local segment */
+#define LOCAL_SLB_BASE	GLOBAL_SLB_NR
+
+/* Effective address window reserved for per-cpu local mappings (see vm_map()) */
+#define LOCAL_EA_BEGIN	0x0800000000000000ULL
+#define LOCAL_EA_END	0x0900000000000000ULL
+
+/*
+ * Write one SLB entry with slbmte.
+ *
+ * esid and vsid are segment numbers (callers in this file pass
+ * address >> 28) and index is OR-ed into the low bits of RB to select
+ * the SLB slot that gets written.
+ */
+static void __nomcount slb_install(unsigned long esid, unsigned long vsid, unsigned int index)
+{
+	unsigned long rs;
+	unsigned long rb;
+
+	rs = vsid << (63-51);		/* 256MB VSID */
+	rs |= 1UL << (63-53);		/* Kp = 1 */
+
+	rb = esid << (63-35);		/* 256MB ESID */
+	rb |= 1UL << (63-36);		/* V = 1 */
+	rb |= index;
+
+	asm volatile("slbmte %0,%1" : : "r"(rs), "r"(rb) : "memory");
+}
+
+#if 0
+/*
+ * Compiled out: invalidate the SLB entry for a single segment with slbie,
+ * bracketed by isync. esid is a segment number (address >> 28).
+ */
+static void slb_remove(unsigned long esid)
+{
+	asm volatile("isync ; slbie %0 ; isync" : : "r"(esid << 28) : "memory");
+}
+#endif
+
+/*
+ * Drop every SLB entry on the calling thread: an slbmte with zero operands
+ * followed by slbia, bracketed by isync.
+ *
+ * NOTE(review): the zero slbmte presumably clears entry 0, which slbia is
+ * not required to invalidate -- confirm against the ISA.
+ */
+static void slb_remove_all(void)
+{
+	asm volatile("isync ; slbmte %0,%0 ; slbia ; isync" : : "r"(0) : "memory");
+}
+
+/*
+ * Install a 1:1 segment entry (VSID == ESID) covering ea.
+ *
+ * The slot comes from this cpu's round-robin allocator, which cycles
+ * through slots [0, GLOBAL_SLB_NR) and overwrites whatever was there.
+ */
+static void __nomcount slb_add(unsigned long ea)
+{
+	struct cpu_thread *c = this_cpu();
+	uint64_t segment = ea >> 28;
+
+	slb_install(segment, segment, c->vm_slb_rr);
+
+	/* Advance the allocator, wrapping before the per-cpu slots */
+	if (++c->vm_slb_rr == GLOBAL_SLB_NR)
+		c->vm_slb_rr = 0;
+}
+
+/* One hash page table entry: two doublewords, stored big-endian (see htab_install()) */
+struct hpte {
+	uint64_t dword[2];
+};
+
+/* A page table entry group: the 8 candidate slots that one hash value selects */
+struct hpteg {
+	struct hpte hpte[8];
+};
+
+/* Hash table allocated by vm_init(): 1 << htab_shift bytes */
+static struct hpteg *htab;
+static unsigned long htab_shift;
+/* Number of PTEGs minus one; masks a hash down to a PTEG index */
+static unsigned long htab_pteg_mask;
+
+/* Taken by htab_install() and by htab_remove() for non-local entries */
+static struct lock htab_lock;
+
+/*
+ * Install a hash table entry translating the page at va to the page at pa.
+ *
+ * rw:    non-zero for a writeable mapping, zero sets the pp bits to 110
+ * ex:    non-zero for an executable mapping, zero sets the N bit
+ * ci:    non-zero for a cache-inhibited mapping (WIMG = 0111)
+ * local: marks the entry with SW[0] so htab_remove() can tell per-cpu
+ *        entries from global ones
+ *
+ * Slot selection within the PTEG, under htab_lock: a valid entry with the
+ * same AVA is replaced; otherwise the first invalid slot is used; if all
+ * eight slots are valid a victim is picked from the low timebase bits.
+ */
+static void __nomcount htab_install(unsigned long va, unsigned long pa, int rw, int ex, int ci, bool local)
+{
+	unsigned long hash;
+	struct hpteg *hpteg;
+	struct hpte *hpte;
+	unsigned long ava = va >> 23;
+	unsigned long arpn = pa >> 12;
+	unsigned long dw0, dw1;
+	unsigned long _dw0;
+	unsigned long _ava;
+	unsigned int hstart, hend;
+	unsigned int i;
+
+	dw0 = ava << (63-56); /* AVA = ava */
+	dw0 |= 0x1; /* V = 1 */
+	if (local)
+		dw0 |= 0x8; /* SW[0] = 1 */
+
+	dw1 = (arpn << (63-43 - 8)); /* ARPN||LP = arpn */
+	if (!rw)
+		dw1 |= (1UL << (63 - 0)) | (1UL << (63 - 63 + 1)); /* pp = 110 */
+	if (!ex)
+		dw1 |= (1UL << (63 - 61)); /* N = 1 */
+	dw1 |= (1UL << (63 - 60 + 1)); /* WIMG = 0010 */
+	if (ci)
+		dw1 |= (1UL << (63 - 60)) | (1UL << (63 - 60 + 2)); /* WIMG = 0111 */
+	dw1 |= (1UL << (63 - 55)) | (1UL << (63 - 56)); /* R=C=1 */
+
+	/* Same hash as htab_remove(): page index XOR segment number */
+	hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	lock(&htab_lock);
+
+	hstart = 0;
+	hend = 7;
+
+	for (i = hstart; i <= hend; i++) {
+		hpte = &hpteg->hpte[i];
+
+		_dw0 = be64_to_cpu(hpte->dword[0]);
+		if (_dw0 & 1) {
+			_ava = _dw0 >> (63 - 56);
+			if (_ava == ava) {
+				/* Replace insertion */
+				goto install;
+			}
+
+			continue;
+		}
+
+		/* An invalid slot is expected to be fully zero */
+		assert(!_dw0);
+		goto install;
+	}
+
+	/* PTEG full: evict a slot chosen from the timebase */
+	i = mftb();
+	i = (i ^ (i >> 4)) & 0x7;
+	hpte = &hpteg->hpte[i];
+
+install:
+	/* Invalidate first, then write dword 1, and only then the valid dword 0 */
+	hpte->dword[0] = 0;
+	eieio();
+	hpte->dword[1] = cpu_to_be64(dw1);
+	eieio();
+	hpte->dword[0] = cpu_to_be64(dw0);
+	asm volatile("ptesync" ::: "memory");
+	unlock(&htab_lock);
+}
+
+/*
+ * Remove the hash table entry for the page at va, if present, and
+ * invalidate the TLB for that page.
+ *
+ * Local (per-cpu) entries are cleared with a compare-and-swap without
+ * taking htab_lock, then flushed with tlbiel. Global entries are cleared
+ * under htab_lock and flushed with tlbie + tlbsync.
+ *
+ * The TLB invalidation is issued whether or not a matching entry was found.
+ */
+static void htab_remove(unsigned long va, int local)
+{
+	unsigned long hash;
+	struct hpteg *hpteg;
+	unsigned long ava = va >> 23;
+	unsigned long dw0;
+	unsigned int hstart, hend;
+	unsigned int i;
+
+	/* Rebuild the dword 0 that htab_install() would have written */
+	dw0 = ava << (63-56);
+	dw0 |= 0x1;
+	if (local)
+		dw0 |= 0x8;
+
+	hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	if (!local)
+		lock(&htab_lock);
+again:
+	hstart = 0;
+	hend = 7;
+
+	for (i = hstart; i <= hend; i++) {
+		struct hpte *hpte = &hpteg->hpte[i];
+		unsigned long _raw_dw0, _dw0;
+
+		_raw_dw0 = hpte->dword[0];
+		_dw0 = be64_to_cpu(_raw_dw0);
+
+		if (!(_dw0 & 1)) {
+			assert(!_raw_dw0);
+			continue;
+		}
+
+		if (_dw0 != dw0) {
+			/* A different dword 0 must not carry our AVA */
+			assert(_dw0 >> 7 != ava);
+			continue;
+		}
+
+		if (local) {
+			/* Slot changed under us (no lock held here): rescan the PTEG */
+			if (__cmpxchg64(&hpte->dword[0], _raw_dw0, 0) != _raw_dw0)
+				goto again;
+		} else {
+			hpte->dword[0] = 0;
+		}
+
+		break;
+	}
+
+	if (local) {
+		asm volatile("ptesync" ::: "memory");
+		asm volatile("tlbiel %0" : : "r"(va & ~0xfffULL));
+		asm volatile("ptesync" ::: "memory");
+	} else {
+		unlock(&htab_lock);
+		asm volatile("ptesync" ::: "memory");
+		asm volatile("tlbie %0,%1" : : "r"(va & ~0xfffULL), "r"(0));
+		asm volatile("eieio ; tlbsync ; ptesync" ::: "memory");
+	}
+}
+
+/*
+ * Try to fix problems in callers if !strict.
+ *
+ * With vm_strict false, overlapping global maps, unmaps of unknown ranges
+ * and faults outside any map are logged and worked around instead of
+ * asserting.
+ */
+static bool vm_strict = false;
+
+/* Global mappings; __vm_map() inserts new entries in address order */
+static struct list_head vm_maps = LIST_HEAD_INIT(vm_maps);
+/* Protects vm_maps and nr_vm_maps */
+static struct lock vm_maps_lock;
+/* Number of entries currently on vm_maps */
+static unsigned long nr_vm_maps;
+
+/*
+ * Record a mapping of [addr, addr + len) onto physical address pa.
+ *
+ * Nothing is installed in the hash table here; this only records the
+ * extent that vm_dsi() looks up when a fault occurs.
+ *
+ * local: overwrite this cpu's single vm_local_map slot and return. No
+ *        allocation, list or lock is involved.
+ * !local: allocate a struct vm_map and insert it into vm_maps in front of
+ *        the first existing map that lies entirely above the new range.
+ *        An overlap with an existing map asserts when vm_strict is set;
+ *        otherwise it is logged and the new map is inserted anyway.
+ *
+ * For the global case the cpu is taken out of virtual mode around the
+ * locked section and put back afterwards.
+ */
+static void __vm_map(const char *name, unsigned long addr, unsigned long len, unsigned long pa, bool r, bool w, bool x, bool ci, bool local)
+{
+	struct cpu_thread *c = this_cpu();
+	/* Per-cpu state; shadows the file-scope vm_setup flag */
+	bool vm_setup = c->vm_setup;
+	struct vm_map *new;
+	struct vm_map *vmm;
+
+	if (local) {
+		new = &c->vm_local_map;
+		new->name = name;
+		new->address = addr;
+		new->length = len;
+		new->pa = pa;
+		new->readable = r;
+		new->writeable = w;
+		new->executable = x;
+		new->ci = ci;
+
+		return;
+	}
+
+	new = zalloc(sizeof(*new));
+	assert(new);
+
+	new->name = name;
+	new->address = addr;
+	new->length = len;
+	new->pa = pa;
+	new->readable = r;
+	new->writeable = w;
+	new->executable = x;
+	new->ci = ci;
+
+	/* Can not take a d-side fault while holding this lock */
+	if (vm_setup)
+		vm_exit();
+	lock(&vm_maps_lock);
+
+	list_for_each(&vm_maps, vmm, list) {
+		/* Existing map ends at or below the new range: keep looking */
+		if (addr >= vmm->address + vmm->length)
+			continue;
+		/* Existing map starts at or above the end of the new range: insert before it */
+		if (addr + len <= vmm->address) {
+			list_add_before(&vm_maps, &new->list, &vmm->list);
+			goto found;
+		}
+
+		/* The ranges overlap */
+		if (!vm_strict) {
+			printf("vm_map_global %s %lx-%lx collided with vmm:%s %llx-%llx\n", name, addr, addr + len, vmm->name, vmm->address, vmm->address + vmm->length);
+			list_add_before(&vm_maps, &new->list, &vmm->list);
+			goto found;
+		}
+		assert(0);
+	}
+	list_add_tail(&vm_maps, &new->list);
+found:
+	nr_vm_maps++;
+	unlock(&vm_maps_lock);
+	if (vm_setup)
+		vm_enter();
+}
+
+/*
+ * Drop the mapping recorded for [addr, addr + len) and remove its hash
+ * table entries.
+ *
+ * local: addr and len must match this cpu's vm_local_map exactly
+ *        (asserted). The slot is zeroed and, when the cpu is in virtual
+ *        mode, each page is removed with htab_remove().
+ * !local: the map with exactly this address and length is unlinked from
+ *        vm_maps and freed. If none matches, vm_strict decides between an
+ *        assert and a logged backtrace.
+ */
+static void __vm_unmap(unsigned long addr, unsigned long len, bool local)
+{
+	struct cpu_thread *c = this_cpu();
+	/* Per-cpu state; shadows the file-scope vm_setup flag */
+	bool vm_setup = c->vm_setup;
+	unsigned long end = addr + len;
+	struct vm_map *vmm;
+
+	if (local) {
+		vmm = &c->vm_local_map;
+		assert(addr == vmm->address);
+		assert(len == vmm->length);
+		memset(vmm, 0, sizeof(struct vm_map));
+
+		if (vm_setup) {
+			while (addr < end) {
+				htab_remove(addr, local);
+				addr += PAGE_SIZE;
+			}
+		}
+
+		return;
+	}
+
+	/* Can not take a d-side fault while holding this lock */
+	if (vm_setup)
+		vm_exit();
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		if (addr != vmm->address)
+			continue;
+		if (len != vmm->length)
+			continue;
+		goto found;
+	}
+	/* No exact match: nothing to free on the way out */
+	vmm = NULL;
+	unlock(&vm_maps_lock);
+	if (!vm_strict) {
+		printf("unmap didn't find anything\n");
+		backtrace();
+		goto out;
+	}
+	assert(0);
+
+found:
+	list_del(&vmm->list);
+
+	/* vm_setup is the state sampled on entry, before the vm_exit() above */
+	if (vm_setup) {
+		while (addr < end) {
+			htab_remove(addr, local);
+			addr += PAGE_SIZE;
+		}
+	}
+
+	nr_vm_maps--;
+	unlock(&vm_maps_lock);
+out:
+	if (vm_setup)
+		vm_enter();
+
+	if (vmm)
+		free(vmm);
+}
+
+
+/*
+ * Register a globally visible, readable, non-executable 1:1 mapping of
+ * [addr, addr + len). rw makes it writeable, ci makes it cache-inhibited.
+ */
+void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci)
+{
+	const bool readable = true;
+	const bool executable = false;
+	const bool local = false;
+
+	/* 1:1: the effective address is also the physical address */
+	__vm_map(name, addr, len, addr, readable, rw, executable, ci, local);
+}
+
+/*
+ * Register a globally visible 1:1 mapping for code: readable and
+ * executable, not writeable, cacheable.
+ */
+static void vm_map_global_text(const char *name, unsigned long addr, unsigned long len)
+{
+	const bool readable = true;
+	const bool writeable = false;
+	const bool executable = true;
+	const bool cache_inhibited = false;
+	const bool local = false;
+
+	__vm_map(name, addr, len, addr, readable, writeable, executable, cache_inhibited, local);
+}
+
+/*
+ * Remove a global mapping. addr and len must be exactly the values that
+ * were passed to vm_map_global().
+ */
+void vm_unmap_global(unsigned long addr, unsigned long len)
+{
+	const bool local = false;
+
+	__vm_unmap(addr, len, local);
+}
+
+
+/*
+ * Create this cpu's private mapping of physical range [addr, addr + len).
+ *
+ * Returns the pointer to use for the access. In real mode that is addr
+ * itself; in virtual mode it is an address inside this cpu's local EA
+ * window, carrying the same offset within the page as addr.
+ *
+ * Only one local mapping may exist per cpu at a time; it must be released
+ * with vm_unmap() using the same addr and len. The page-rounded length
+ * must be below 256MB.
+ */
+void *vm_map(unsigned long addr, unsigned long len, bool rw)
+{
+	const unsigned long page_mask = PAGE_SIZE - 1;
+	struct cpu_thread *c = this_cpu();
+	unsigned long local_ea, first_page, last_page;
+
+	/* Can't do nested mappings */
+	assert(!c->vm_local_map_inuse);
+	c->vm_local_map_inuse = true;
+
+	/* Real mode: the physical address is directly usable */
+	if (!c->vm_setup)
+		return (void *)addr;
+
+	/* Widen the range to whole pages */
+	first_page = addr & ~page_mask;
+	last_page = (addr + len + page_mask) & ~page_mask;
+	len = last_page - first_page;
+
+	assert(len < (1 << 28)); /* same segment */
+
+	local_ea = LOCAL_EA_BEGIN + ((unsigned long)c->pir << 30);
+	__vm_map("local", local_ea, len, first_page, true, rw, false, false, true);
+
+	return (void *)(local_ea + (addr & page_mask));
+}
+
+/*
+ * Release the per-cpu mapping created by vm_map(). addr and len are the
+ * physical address and length originally passed to vm_map().
+ */
+void vm_unmap(unsigned long addr, unsigned long len)
+{
+	const unsigned long page_mask = PAGE_SIZE - 1;
+	struct cpu_thread *c = this_cpu();
+	unsigned long first_page, last_page;
+
+	assert(c->vm_local_map_inuse);
+	c->vm_local_map_inuse = false;
+
+	/* Real mode: vm_map() recorded nothing beyond the in-use flag */
+	if (!c->vm_setup)
+		return;
+
+	/* Same page rounding as vm_map() so the lengths match */
+	first_page = addr & ~page_mask;
+	last_page = (addr + len + page_mask) & ~page_mask;
+	len = last_page - first_page;
+
+	assert(len < (1 << 28)); /* same segment */
+
+	__vm_unmap(LOCAL_EA_BEGIN + ((unsigned long)c->pir << 30), len, true);
+}
+
+/* One entry of the table whose address vm_init_cpu() writes to SPR_PTCR */
+struct prte {
+	unsigned long dword[2];
+};
+
+/* 64KB table allocated by vm_init(); entry 0 holds the hash table base and size */
+static struct prte *prtab;
+
+/*
+ * Program the calling thread's translation state: LPCR, LPID/PID, HRMOR,
+ * the table pointer (PTCR) and the AMR-family registers. Then clear the
+ * SLB and install the 1:1 segment entry for this cpu's local EA window in
+ * slot LOCAL_SLB_BASE.
+ *
+ * This does not turn relocation on; vm_enter() does that.
+ */
+static void vm_init_cpu(void)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long esid = (LOCAL_EA_BEGIN + ((unsigned long)c->pir << 30)) >> 28;
+	unsigned long vsid = (LOCAL_EA_BEGIN + ((unsigned long)c->pir << 30)) >> 28;
+
+	/* NOTE(review): clears LPCR bits 0-3, 41, 43 and 54; confirm their meaning against the ISA */
+	mtspr(SPR_LPCR, mfspr(SPR_LPCR) &
+		~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43) | PPC_BIT(54)));
+	mtspr(SPR_LPID, 0);
+	mtspr(SPR_PID, 0);
+	mtspr(SPR_HRMOR, 0);
+	mtspr(SPR_PTCR, (unsigned long)prtab);
+	mtspr(SPR_AMR, 0);
+	mtspr(SPR_IAMR, 0);
+	mtspr(SPR_AMOR, 0);
+	mtspr(SPR_UAMOR, 0);
+
+	slb_remove_all();
+	slb_install(esid, vsid, LOCAL_SLB_BASE);
+}
+
+/*
+ * Set up translation state on the calling thread and switch it to virtual
+ * mode. vm_enter() asserts that vm_init() has already completed.
+ */
+void vm_init_secondary(void)
+{
+	vm_init_cpu();
+	vm_enter();
+}
+
+/*
+ * Returns true when the calling thread is not in virtual mode: either VM
+ * has not been set up globally, or this cpu is currently outside it.
+ */
+bool vm_realmode(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	if (!vm_setup)
+		return true;
+
+	return !c->vm_setup;
+}
+
+/*
+ * Switch the calling thread to virtual mode by setting MSR[IR] and MSR[DR]
+ * and marking the cpu as set up.
+ *
+ * If the cpu is already marked as set up, the MSR bits are still set and
+ * the redundant call is logged with a backtrace.
+ */
+void vm_enter(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	if (c->vm_setup) {
+		mtmsr(mfmsr() | (MSR_IR|MSR_DR));
+		printf("CPU:%d vm_enter already entered\n", c->pir);
+		backtrace();
+		return;
+	}
+	/* Flag first, then flip the MSR */
+	c->vm_setup = true;
+	mtmsr(mfmsr() | (MSR_IR|MSR_DR));
+}
+
+/*
+ * Switch the calling thread back to real mode by clearing MSR[IR] and
+ * MSR[DR] and marking the cpu as not set up.
+ *
+ * If the cpu is already marked as not set up, the MSR bits are still
+ * cleared and the redundant call is logged with a backtrace.
+ */
+void vm_exit(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	if (!c->vm_setup) {
+		mtmsr(mfmsr() & ~(MSR_IR|MSR_DR));
+		printf("CPU:%d vm_exit already exited\n", c->pir);
+		backtrace();
+		return;
+	}
+	/* Flag first, then flip the MSR */
+	c->vm_setup = false;
+	mtmsr(mfmsr() & ~(MSR_IR|MSR_DR));
+}
+
+/*
+ * Data segment fault handler: install a 1:1 SLB entry for the segment
+ * containing dar. Always returns true.
+ *
+ * dar must lie outside the local EA window (asserted). c->vm_setup is
+ * cleared for the duration of the handler and set again on return.
+ */
+bool __nomcount vm_dslb(uint64_t nia, uint64_t dar)
+{
+	struct cpu_thread *c = this_cpu();
+	/* Per-cpu state; shadows the file-scope vm_setup flag */
+	bool vm_setup = c->vm_setup;
+
+	assert(vm_setup);
+	c->vm_setup = false;
+
+	/*
+	 * Per-cpu map ranges are bolted to per-cpu SLBs.
+	 */
+	assert((dar < LOCAL_EA_BEGIN) ||
+		(dar >= LOCAL_EA_END));
+
+	(void)nia;
+	slb_add(dar);
+
+	c->vm_setup = true;
+
+	return true;
+}
+
+/*
+ * Instruction segment fault handler: install a 1:1 SLB entry for the
+ * segment containing nia. Always returns true.
+ *
+ * c->vm_setup is cleared for the duration of the handler and set again on
+ * return.
+ */
+bool __nomcount vm_islb(uint64_t nia)
+{
+	struct cpu_thread *c = this_cpu();
+	/* Per-cpu state; shadows the file-scope vm_setup flag */
+	bool vm_setup = c->vm_setup;
+
+	assert(vm_setup);
+	c->vm_setup = false;
+
+	slb_add(nia);
+
+	c->vm_setup = true;
+
+	return true;
+}
+
+/*
+ * Data storage (page fault) handler.
+ *
+ * Finds the mapping that covers dar -- this cpu's local map for addresses
+ * in [LOCAL_EA_BEGIN, LOCAL_EA_END), otherwise the global vm_maps list --
+ * and installs a hash table entry for the faulting page.
+ *
+ * nia:   address of the faulting instruction, used in diagnostics only
+ * dar:   faulting data address
+ * store: true if the faulting access was a store
+ *
+ * Returns true if a translation was installed, false if the fault could
+ * not be resolved.
+ *
+ * When vm_strict is clear, problems are logged and worked around: an
+ * address outside every global map gets a 1:1 entry (cache-inhibited for
+ * the MMIO range), and a store to a read-only map gets a writeable entry.
+ * When vm_strict is set these cases assert.
+ *
+ * c->vm_setup is cleared for the duration of the handler and set again on
+ * return.
+ *
+ * vm_maps_lock is only taken for global lookups; the local map belongs to
+ * this cpu and is used without it. Every unlock after the "found" label is
+ * therefore conditional on !local.
+ */
+bool __nomcount vm_dsi(uint64_t nia, uint64_t dar, bool store)
+{
+	struct cpu_thread *c = this_cpu();
+	/* Per-cpu state; shadows the file-scope vm_setup flag */
+	bool vm_setup = c->vm_setup;
+	struct vm_map *vmm;
+	uint64_t pa;
+	bool ret = true;
+	bool local;
+
+	(void)nia;
+
+	assert(vm_setup);
+	c->vm_setup = false;
+
+	if ((dar >= LOCAL_EA_BEGIN) && (dar < LOCAL_EA_END)) {
+		/* Per-cpu mapping: no lock is taken on this path */
+		local = true;
+		vmm = &c->vm_local_map;
+		if (dar >= vmm->address && dar < vmm->address + vmm->length)
+			goto found;
+		goto not_found;
+	}
+
+	local = false;
+
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		/* Global maps are always 1:1 */
+		assert(vmm->pa == vmm->address);
+		if (dar >= vmm->address && dar < vmm->address + vmm->length)
+			goto found;
+	}
+	if (!vm_strict) {
+		if (dar >= 0x0006000000000000 && dar < 0x0007000000000000)
+			/* MMIO */
+			htab_install(dar, dar, 1, 0, 1, false);
+		else if (dar < LOCAL_EA_BEGIN)
+			htab_install(dar, dar, 1, 0, 0, false);
+		else
+			ret = false;
+		unlock(&vm_maps_lock);
+		printf("Page fault with no VMM at NIA:0x%016llx DAR:0x%016llx, store:%d\n", nia, dar, store);
+		backtrace();
+		goto out;
+	}
+	unlock(&vm_maps_lock);
+not_found:
+	printf("  vmm not found\n");
+	ret = false;
+	assert(0);
+	goto out;
+
+found:
+	/* Physical address of the faulting page within the map */
+	pa = vmm->pa + (dar & ~(PAGE_SIZE - 1)) - vmm->address;
+	if (!vmm->readable) {
+		if (!local)
+			unlock(&vm_maps_lock);
+		printf("  vmm not readable\n");
+		ret = false;
+		assert(0);
+		goto out;
+	}
+	if (store && !vmm->writeable) {
+		if (!vm_strict) {
+			/* Paper over the caller's bug: install a writeable entry */
+			htab_install(dar, pa, store, 0, vmm->ci, local);
+			if (!local)
+				unlock(&vm_maps_lock);
+			printf("Page fault store to RO VMM:%s at NIA:0x%016llx DAR:0x%016llx\n", vmm->name, nia, dar);
+			backtrace();
+			goto out;
+		}
+		if (!local)
+			unlock(&vm_maps_lock);
+		printf("  vmm not writeable\n");
+		ret = false;
+		assert(0);
+		goto out;
+	}
+
+	htab_install(dar, pa, vmm->writeable, vmm->executable, vmm->ci, local);
+	if (!local)
+		unlock(&vm_maps_lock);
+
+out:
+	c->vm_setup = true;
+	return ret;
+}
+
+/*
+ * Instruction storage (page fault) handler.
+ *
+ * Installs a read-only, executable 1:1 hash table entry for the page
+ * containing nia, provided nia lies within [_stext, _etext). Returns false
+ * for any other address, true once the entry is installed.
+ */
+bool __nomcount vm_isi(uint64_t nia)
+{
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
+
+	assert(vm_setup);
+
+	/* Refuse to execute anything outside the text section */
+	if (nia < (unsigned long)_stext || nia >= (unsigned long)_etext)
+		return false;
+
+	c->vm_setup = false;
+	htab_install(nia, nia, 0, 1, 0, false);
+	c->vm_setup = true;
+
+	return true;
+}
+
+/* Job callback: take the cpu it runs on out of virtual mode. arg is unused. */
+static void cpu_stop_vm(void *arg __unused)
+{
+	vm_exit();
+}
+
+/*
+ * Job callback: clear the SLB and zero the table pointer (PTCR) on the
+ * cpu it runs on. arg is unused.
+ */
+static void cpu_cleanup_vm(void *arg __unused)
+{
+	slb_remove_all();
+	mtspr(SPR_PTCR, 0);
+}
+
+/*
+ * Take every available cpu out of virtual mode and then tear down each
+ * cpu's translation state.
+ *
+ * This runs in two phases, each queued as a job on every other cpu and run
+ * directly on the calling cpu: first cpu_stop_vm(), then, once all of
+ * those have completed, cpu_cleanup_vm().
+ */
+static void cpu_all_destroy_vm(void)
+{
+	struct cpu_thread *cpu;
+	struct cpu_job **jobs;
+
+	/* jobs[] is indexed by PIR, so it needs cpu_max_pir + 1 slots */
+	jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
+	assert(jobs);
+
+	/* Stop all CPUs */
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu())
+			continue;
+		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_stop_vm",
+						cpu_stop_vm, NULL);
+	}
+
+	/* this cpu */
+	cpu_stop_vm(NULL);
+
+	/* Cleanup after all stop */
+	for_each_available_cpu(cpu) {
+		/* The slot for this cpu was never set, so it is skipped */
+		if (jobs[cpu->pir])
+			cpu_wait_job(jobs[cpu->pir], true);
+	}
+
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu())
+			continue;
+		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_cleanup_vm",
+						cpu_cleanup_vm, NULL);
+	}
+
+	/* this cpu */
+	cpu_cleanup_vm(NULL);
+
+	for_each_available_cpu(cpu) {
+		if (jobs[cpu->pir])
+			cpu_wait_job(jobs[cpu->pir], true);
+	}
+
+	free(jobs);
+}
+
+/*
+ * Bring up virtual memory on the calling cpu.
+ *
+ * Allocates and zeroes a 64KB table for PTCR and a 256KB hash table,
+ * points entry 0 of the former at the latter, and programs this cpu with
+ * vm_init_cpu(). On the first call it also registers the global mappings
+ * for the OPAL image, heap, consoles and boot stacks. Finally it prints the
+ * resulting layout and enters virtual mode.
+ *
+ * Later calls (vm_globals_allocated already set) rebuild the tables but
+ * keep the existing global mappings.
+ */
+void vm_init(void)
+{
+	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
+	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
+	unsigned long htab_nr_bytes;
+	unsigned long htab_nr_ptegs;
+
+	prtab = memalign(64*1024, 64*1024);
+	assert(prtab);
+	memset(prtab, 0, 64*1024);
+
+	htab_shift = 18;
+	htab_nr_bytes = 1UL << htab_shift;
+	htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
+	htab_pteg_mask = htab_nr_ptegs - 1;
+	/* The table is aligned to its own size */
+	htab = memalign(1UL << htab_shift, htab_nr_bytes);
+	assert(htab);
+	memset(htab, 0, htab_nr_bytes);
+
+	/* Entry 0: hash table base, with the size encoded as (shift - 18) in the low bits */
+	prtab[0].dword[0] = cpu_to_be64((unsigned long)htab | (htab_shift - 18));
+	prtab[0].dword[1] = 0;
+
+	eieio();
+
+	vm_init_cpu();
+
+	cleanup_global_tlb();
+
+	if (vm_globals_allocated)
+		goto done;
+
+	vm_map_global_text("OPAL text", (unsigned long)_stext,
+		(unsigned long)_etext - (unsigned long)_stext);
+	vm_map_global("OPAL rodata", (unsigned long)__rodata_start,
+		(unsigned long)__rodata_end - (unsigned long)__rodata_start,
+		false, false);
+	vm_map_global("OPAL data", (unsigned long)_sdata,
+		(unsigned long)_edata - (unsigned long)_sdata,
+		true, false);
+	vm_map_global("OPAL bss", (unsigned long)_sbss,
+		(unsigned long)_ebss - (unsigned long)_sbss,
+		true, false);
+	vm_map_global("OPAL sym map", (unsigned long)__sym_map_start,
+		(unsigned long)__sym_map_end - (unsigned long)__sym_map_start,
+		false, false);
+	vm_map_global("OPAL heap", HEAP_BASE, HEAP_SIZE, true, false);
+	vm_map_global("Memory console", INMEM_CON_START, INMEM_CON_LEN, true, false);
+	vm_map_global("Hostboot console", HBRT_CON_START, HBRT_CON_LEN, false, false);
+	vm_map_global("SPIRA heap", SPIRA_HEAP_BASE, SPIRA_HEAP_SIZE, false, false);
+	vm_map_global("PSI TCE table", PSI_TCE_TABLE_BASE, PSI_TCE_TABLE_SIZE_P8, false, false);
+	vm_map_global("OPAL boot stacks", stack_start, stack_end - stack_start, true, false);
+	vm_globals_allocated = true;
+
+done:
+	/* Unconditional debug dump of the table addresses and global maps */
+	if (1) {
+		struct vm_map *vmm;
+		printf("VMM: SETUP\n");
+		printf(" PRTAB:%p\n", prtab);
+		printf(" HTAB: %p\n", htab);
+		printf(" Global mappings\n");
+		list_for_each(&vm_maps, vmm, list)
+			printf("%28s 0x%08llx-0x%08llx\n", vmm->name,
+				vmm->address, vmm->address + vmm->length);
+	}
+
+	vm_setup = true;
+
+	vm_enter();
+}
+
+/*
+ * Re-describe the stack mapping: find the first global map overlapping
+ * the stack area and rewrite its name, address and length for the stack
+ * range computed from the current cpu_max_pir. Asserts if no global map
+ * overlaps the stack area.
+ *
+ * Only data relocation (MSR[DR]) is switched off around the locked
+ * section, directly via mtmsr rather than through vm_exit()/vm_enter().
+ */
+void vm_init_stacks(void)
+{
+	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
+	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
+	struct cpu_thread *c = this_cpu();
+	struct vm_map *vmm;
+
+	/* Can not take a d-side fault while holding this lock */
+	if (c->vm_setup)
+		mtmsr(mfmsr() & ~MSR_DR);
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		/* Skip maps entirely above or entirely below the stack area */
+		if (vmm->address >= stack_end)
+			continue;
+		if (vmm->address + vmm->length <= stack_start)
+			continue;
+		goto found;
+	}
+	unlock(&vm_maps_lock);
+	assert(0);
+
+found:
+	vmm->name = "OPAL stacks";
+	vmm->address = stack_start;
+	vmm->length = stack_end - stack_start;
+	unlock(&vm_maps_lock);
+	if (c->vm_setup)
+		mtmsr(mfmsr() | MSR_DR);
+}
+
+/*
+ * Tear down virtual memory: print the global mappings, take all cpus out
+ * of virtual mode and clean up their translation state, then free the
+ * hash table and the table that PTCR pointed at.
+ *
+ * The global map list itself is deliberately kept (the freeing loop is
+ * compiled out) so that a later vm_init() can reuse it.
+ */
+void vm_destroy(void)
+{
+	assert(vm_setup);
+
+	/* Unconditional debug dump of the global maps */
+	if (1) {
+		struct vm_map *vmm;
+		printf("VMM: TEARDOWN\n");
+		printf(" Global mappings\n");
+		list_for_each(&vm_maps, vmm, list)
+			printf("%28s 0x%08llx-0x%08llx\n", vmm->name,
+				vmm->address, vmm->address + vmm->length);
+	}
+
+	cpu_all_destroy_vm();
+
+	vm_setup = false;
+
+	if (0) { /* XXX: leave for VMM enabled fast-reboot */
+		while (!list_empty(&vm_maps)) {
+			struct vm_map *vmm;
+			vmm = list_pop(&vm_maps, struct vm_map, list);
+			free(vmm);
+		}
+	}
+
+	free(htab);
+	htab = NULL;
+	free(prtab);
+	prtab = NULL;
+}
diff --git a/hdata/spira.c b/hdata/spira.c
index 6891a9c71..743aecfd6 100644
--- a/hdata/spira.c
+++ b/hdata/spira.c
@@ -1578,11 +1578,18 @@ static void fixup_spira(void)
 
 int parse_hdat(bool is_opal)
 {
+	int ret = 0;
+
 	cpu_type = PVR_TYPE(mfspr(SPR_PVR));
 
 	prlog(PR_DEBUG, "Parsing HDAT...\n");
 
+	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), true, false);
 	fixup_spira();
+	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
+
+	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), false, false);
+	vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), false, false);
 
 	/*
 	 * Basic DT root stuff
@@ -1603,9 +1610,12 @@ int parse_hdat(bool is_opal)
 	dt_init_led_node();
 
 	/* Parse SPPACA and/or PCIA */
-	if (!pcia_parse())
-		if (paca_parse() < 0)
-			return -1;
+	if (!pcia_parse()) {
+		if (paca_parse() < 0) {
+			ret = -1;
+			goto out;
+		}
+	}
 
 	/* IPL params */
 	add_iplparams();
@@ -1652,6 +1662,9 @@ int parse_hdat(bool is_opal)
 		node_stb_parse();
 
 	prlog(PR_DEBUG, "Parsing HDAT...done\n");
+out:
+	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
+	vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
 
-	return 0;
+	return ret;
 }
diff --git a/hw/fake-nvram.c b/hw/fake-nvram.c
index 236ad5b91..97f3f31ec 100644
--- a/hw/fake-nvram.c
+++ b/hw/fake-nvram.c
@@ -36,12 +36,16 @@ int fake_nvram_info(uint32_t *total_size)
 
 int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 {
+	void *t;
+
 	if (!nvram_region)
 		return -ENODEV;
 
+	t = vm_map(nvram_region->start + src, len, false);
 	lock(&fake_nvram_lock);
-	memcpy(dst, (void *) (nvram_region->start + src), len);
+	memcpy(dst, t, len);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + src, len);
 
 	nvram_read_complete(true);
 
@@ -50,12 +54,16 @@ int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 
 int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
 {
+	void *t;
+
 	if (!nvram_region)
 		return OPAL_HARDWARE;
 
+	t = vm_map(nvram_region->start + offset, size, true);
 	lock(&fake_nvram_lock);
-	memcpy((void *) (nvram_region->start + offset), src, size);
+	memcpy(t, src, size);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + offset, size);
 
 	return 0;
 }
diff --git a/hw/homer.c b/hw/homer.c
index 34ee3370d..6b51da59c 100644
--- a/hw/homer.c
+++ b/hw/homer.c
@@ -121,6 +121,9 @@ static void homer_init_chip(struct proc_chip *chip)
 
 		chip->homer_base = hbase;
 		chip->homer_size = hsize;
+		/* slw late init and xive late init want to write to HOMER */
+		/* XXX: make it read only until then? */
+		vm_map_global("HOMER Image", hbase, hsize, true, false);
 	}
 
 	/*
@@ -147,6 +150,7 @@ static void homer_init_chip(struct proc_chip *chip)
 		chip->slw_base = sbase;
 		chip->slw_bar_size = ssize;
 		chip->slw_image_size = ssize; /* will be adjusted later */
+		/* XXX */
 	}
 
 	if (read_pba_bar(chip, bar_occ_common, &obase, &osize)) {
@@ -154,6 +158,7 @@ static void homer_init_chip(struct proc_chip *chip)
 		      obase, osize / 0x100000);
 		chip->occ_common_base = obase;
 		chip->occ_common_size = osize;
+		vm_map_global("OCC Common Area", obase, osize, false, false);
 	}
 }
 
diff --git a/hw/lpc-uart.c b/hw/lpc-uart.c
index bca10e0e9..9e89050fb 100644
--- a/hw/lpc-uart.c
+++ b/hw/lpc-uart.c
@@ -600,6 +600,8 @@ void early_uart_init(void)
 	if (!mmio_uart_base)
 		return;
 
+	vm_map_global("UART MMIO", (unsigned long)mmio_uart_base, 8, true, true);
+
 	clk = dt_prop_get_u32(uart_node, "clock-frequency");
 	baud = dt_prop_get_u32(uart_node, "current-speed");
 
@@ -608,6 +610,7 @@ void early_uart_init(void)
 		prlog(PR_DEBUG, "UART: Using UART at %p\n", mmio_uart_base);
 	} else {
 		prerror("UART: Early init failed!");
+		vm_unmap_global((unsigned long)mmio_uart_base, 8);
 		mmio_uart_base = NULL;
 	}
 }
@@ -619,9 +622,6 @@ void uart_init(void)
 	char *path __unused;
 	const uint32_t *irqp;
 
-	/* Clean up after early_uart_init() */
-	mmio_uart_base = NULL;
-
 	/* UART lock is in the console path and thus must block
 	 * printf re-entrancy
 	 */
@@ -639,13 +639,28 @@ void uart_init(void)
 	 * directly mapped UARTs in simulation environments
 	 */
 	if (n->parent == dt_root) {
+		void *base;
+
 		printf("UART: Found at root !\n");
-		mmio_uart_base = (void *)dt_translate_address(n, 0, NULL);
-		if (!mmio_uart_base) {
+
+		base = (void *)dt_translate_address(n, 0, NULL);
+		if (!base) {
 			printf("UART: Failed to translate address !\n");
 			return;
 		}
 
+		if (mmio_uart_base != base) {
+			void *old;
+
+			vm_map_global("UART MMIO", (unsigned long)base, 8, true, true);
+			old = mmio_uart_base;
+			mmio_uart_base = base;
+
+			/* Clean up after early_uart_init() */
+			if (old)
+				vm_unmap_global((unsigned long)old, 8);
+		}
+
 		/* If it has an interrupt properly, we consider this to be
 		 * a direct XICS/XIVE interrupt
 		 */
@@ -674,6 +689,12 @@ void uart_init(void)
 			lpc_irq = be32_to_cpu(*irqp);
 			prlog(PR_DEBUG, "UART: Using LPC IRQ %d\n", lpc_irq);
 		}
+
+		/* Clean up after early_uart_init() */
+		if (mmio_uart_base) {
+			vm_unmap_global((unsigned long)mmio_uart_base, 8);
+			mmio_uart_base = NULL;
+		}
 	}
 
 
diff --git a/hw/lpc.c b/hw/lpc.c
index 3f5109d73..d040e4136 100644
--- a/hw/lpc.c
+++ b/hw/lpc.c
@@ -1259,6 +1259,8 @@ static void lpc_init_chip_p9(struct dt_node *opb_node)
 	if (!lpc_node)
 		return;
 
+	vm_map_global("LPC MMIO", addr, 0x100000000UL, true, true);
+
 	lpc = zalloc(sizeof(struct lpcm));
 	assert(lpc);
 	lpc->chip_id = gcid;
diff --git a/hw/phb4.c b/hw/phb4.c
index 9a38dc752..79037d767 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -5773,6 +5773,7 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	uint64_t val, phb_bar = 0, irq_bar = 0, bar_en;
 	uint64_t mmio0_bar = 0, mmio0_bmask, mmio0_sz;
 	uint64_t mmio1_bar = 0, mmio1_bmask, mmio1_sz;
+	uint64_t bar_sz;
 	uint64_t reg[4];
 	void *foo;
 	uint64_t mmio_win[4];
@@ -5802,7 +5803,8 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	bar_en = 0;
 
 	/* Initialize PHB register BAR */
-	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, NULL);
+	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, &bar_sz);
+	vm_map_global("PHB REGS", phb_bar, bar_sz, true, true);
 	rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
 			 phb_bar << 8);
 
@@ -5816,18 +5818,21 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
 
 	/* Same with INT BAR (ESB) */
-	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, NULL);
+	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, &bar_sz);
+	vm_map_global("PHB IRQ", irq_bar, bar_sz, true, true);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_IRQ_BAR, irq_bar << 8);
 	bar_en |= XPEC_NEST_STK_BAR_EN_INT;
 
 
 	/* Same with MMIO windows */
 	phys_map_get(gcid, PHB4_64BIT_MMIO, phb_num, &mmio0_bar, &mmio0_sz);
+	vm_map_global("PHB MMIO0", mmio0_bar, mmio0_sz, true, true);
 	mmio0_bmask =  (~(mmio0_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0, mmio0_bar << 8);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0_MASK, mmio0_bmask << 8);
 
 	phys_map_get(gcid, PHB4_32BIT_MMIO, phb_num, &mmio1_bar, &mmio1_sz);
+	vm_map_global("PHB MMIO1", mmio1_bar, mmio1_sz, true, true);
 	mmio1_bmask =  (~(mmio1_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1, mmio1_bar << 8);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1_MASK, mmio1_bmask << 8);
diff --git a/hw/psi.c b/hw/psi.c
index 5435c4655..74f497801 100644
--- a/hw/psi.c
+++ b/hw/psi.c
@@ -964,6 +964,8 @@ static bool psi_init_psihb(struct dt_node *psihb)
 
 	list_add(&psis, &psi->list);
 
+	vm_map_global("PSI", (unsigned long)psi->regs, 0x100, true, true);
+
 	val = in_be64(psi->regs + PSIHB_CR);
 	if (val & PSIHB_CR_FSP_LINK_ACTIVE) {
 		lock(&psi_lock);
diff --git a/hw/slw.c b/hw/slw.c
index c872b630b..9ddb5393e 100644
--- a/hw/slw.c
+++ b/hw/slw.c
@@ -166,7 +166,7 @@ static void slw_patch_reset(void)
 		*(sav++) = *(dst);
 		*(dst++) = *(src++);
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static void slw_unpatch_reset(void)
@@ -182,7 +182,7 @@ static void slw_unpatch_reset(void)
 		*(dst++) = *(sav++);
 		src++;
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
diff --git a/hw/xive.c b/hw/xive.c
index a9f1e7707..ec0d1f5b3 100644
--- a/hw/xive.c
+++ b/hw/xive.c
@@ -1621,6 +1621,7 @@ static bool xive_configure_bars(struct xive *x)
 
 	/* IC BAR */
 	phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
+	vm_map_global("XIVE IC", (unsigned long)x->ic_base, x->ic_size, true, true);
 	val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID;
 	if (IC_PAGE_SIZE == 0x10000) {
 		val |= CQ_IC_BAR_64K;
@@ -1636,6 +1637,8 @@ static bool xive_configure_bars(struct xive *x)
 	 * all phys_map_get(XIVE_TM) calls.
 	 */
 	phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size);
+	if (chip_id == 0)
+		vm_map_global("XIVE TM", (unsigned long)x->tm_base, x->tm_size, true, true);
 	val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID;
 	if (TM_PAGE_SIZE == 0x10000) {
 		x->tm_shift = 16;
@@ -1651,6 +1654,7 @@ static bool xive_configure_bars(struct xive *x)
 
 	/* PC BAR. Clear first, write mask, then write value */
 	phys_map_get(chip_id, XIVE_PC, 0, (uint64_t *)&x->pc_base, &x->pc_size);
+	vm_map_global("XIVE PC", (unsigned long)x->pc_base, x->pc_size, true, true);
 	xive_regwx(x, CQ_PC_BAR, 0);
 	if (x->last_reg_error)
 		return false;
@@ -1665,6 +1669,7 @@ static bool xive_configure_bars(struct xive *x)
 
 	/* VC BAR. Clear first, write mask, then write value */
 	phys_map_get(chip_id, XIVE_VC, 0, (uint64_t *)&x->vc_base, &x->vc_size);
+	vm_map_global("XIVE VC", (unsigned long)x->vc_base, x->vc_size, true, true);
 	xive_regwx(x, CQ_VC_BAR, 0);
 	if (x->last_reg_error)
 		return false;
diff --git a/hw/xscom.c b/hw/xscom.c
index bfe51c22e..40cad2136 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -869,6 +869,8 @@ void xscom_init(void)
 		assert(reg);
 
 		chip->xscom_base = dt_translate_address(xn, 0, NULL);
+		/* XXX: how large is this window? */
+		vm_map_global("XSCOM MMIO", chip->xscom_base, 0x200000000UL, true, true);
 
 		/* Grab processor type and EC level */
 		xscom_init_chip_info(chip);
@@ -882,7 +884,7 @@ void xscom_init(void)
 		prlog(PR_NOTICE, "CHIP: Chip ID %04x type: %s DD%x.%x%d\n",
 		      gcid, chip_name, chip->ec_level >> 4,
 		      chip->ec_level & 0xf, chip->ec_rev);
-		prlog(PR_DEBUG, "XSCOM: Base address: 0x%llx\n", chip->xscom_base);
+		prlog(PR_NOTICE, "XSCOM: Base address: 0x%llx\n", chip->xscom_base);
 	}
 
 	/* Collect details to trigger xstop via XSCOM write */
diff --git a/include/cmpxchg.h b/include/cmpxchg.h
index 28911c08c..a46c9765b 100644
--- a/include/cmpxchg.h
+++ b/include/cmpxchg.h
@@ -18,6 +18,9 @@
 #define __CMPXCHG_H
 
 #ifndef __TEST__
+#include <stdint.h>
+#include <processor.h>
+
 /*
  * Bare cmpxchg, no barriers.
  */
diff --git a/include/cpu.h b/include/cpu.h
index 011b12bb9..7d1d35bc7 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -25,6 +25,19 @@
 #include <stack.h>
 #include <timer.h>
 
+/*
+ * Describes one mapped extent: a global one linked on the vm_maps list in
+ * core/vm.c, or the single per-cpu one embedded in struct cpu_thread.
+ */
+struct vm_map {
+	struct list_node list;	/* linkage on the global map list */
+
+	const char *name;	/* label printed in diagnostics */
+	uint64_t address;	/* effective address of the start of the extent */
+	uint64_t pa;		/* physical address backing 'address' */
+	uint64_t length;	/* size of the extent in bytes */
+	bool readable;
+	bool writeable;
+	bool executable;
+	bool ci;		/* cache-inhibited */
+};
+
 /*
  * cpu_thread is our internal structure representing each
  * thread in the system
@@ -83,10 +96,19 @@ struct cpu_thread {
 	struct bt_entry			stack_bot_bt[CPU_BACKTRACE_SIZE];
 	struct bt_metadata		stack_bot_bt_metadata;
 #endif
+	/*
+	 * Per-thread VM parameters
+	 */
+	struct vm_map			vm_local_map; /* per-cpu map */
+	bool				vm_local_map_inuse;
+	uint8_t				vm_slb_rr; /* RR allocator */
+	bool				vm_setup; /* virtual memory is up */
+
 	struct lock			job_lock;
 	struct list_head		job_queue;
 	uint32_t			job_count;
 	bool				job_has_no_return;
+
 	/*
 	 * Per-core mask tracking for threads in HMI handler and
 	 * a cleanup done bit.
diff --git a/include/elf-abi.h b/include/elf-abi.h
index e8397f70a..18b5c3f07 100644
--- a/include/elf-abi.h
+++ b/include/elf-abi.h
@@ -34,7 +34,15 @@
 static inline uint64_t function_entry_address(void *func)
 {
 #ifdef ELF_ABI_v2
-	u32 *insn = func;
+	u32 *i;
+	u32 insn;
+	u32 insn2;
+
+	i = vm_map((unsigned long)func, sizeof(insn*2), false);
+	insn = *i;
+	insn2 = *(i+1);
+	vm_unmap((unsigned long)func, sizeof(insn*2));
+
 	/*
 	 * A PPC64 ABIv2 function may have a local and a global entry
 	 * point. We use the local entry point for branch tables called
@@ -51,12 +59,12 @@ static inline uint64_t function_entry_address(void *func)
 	 * lis   r2,XXXX
 	 * addi  r2,r2,XXXX
 	 */
-	if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
-	     ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
-	    ((*(insn+1) & OP_RT_RA_MASK) == ADDI_R2_R2))
-		return (uint64_t)(insn + 2);
+	if ((((insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
+	     ((insn & OP_RT_RA_MASK) == LIS_R2)) &&
+	    ((insn2 & OP_RT_RA_MASK) == ADDI_R2_R2))
+		return (uint64_t)(i + 2);
 	else
-		return (uint64_t)func;
+		return (uint64_t)i;
 #else
 	return *(uint64_t *)func;
 #endif
diff --git a/include/io.h b/include/io.h
index c056c37e4..cc8964049 100644
--- a/include/io.h
+++ b/include/io.h
@@ -20,6 +20,7 @@
 #ifndef __ASSEMBLY__
 
 #include <compiler.h>
+#include <skiboot.h>
 #include <stdint.h>
 #include <processor.h>
 #include <ccan/endian/endian.h>
@@ -35,8 +36,14 @@
 static inline uint8_t __in_8(const volatile uint8_t *addr)
 {
 	uint8_t val;
-	asm volatile("lbzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lbzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lbzx %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return val;
 }
 
@@ -49,8 +56,14 @@ static inline uint8_t in_8(const volatile uint8_t *addr)
 static inline uint16_t __in_be16(const volatile uint16_t *addr)
 {
 	uint16_t val;
-	asm volatile("lhzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lhzcix %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lhzx %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return val;
 }
 
@@ -68,8 +81,14 @@ static inline uint16_t in_le16(const volatile uint16_t *addr)
 static inline uint32_t __in_be32(const volatile uint32_t *addr)
 {
 	uint32_t val;
-	asm volatile("lwzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lwzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lwzx %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return val;
 }
 
@@ -87,8 +106,14 @@ static inline uint32_t in_le32(const volatile uint32_t *addr)
 static inline uint64_t __in_be64(const volatile uint64_t *addr)
 {
 	uint64_t val;
-	asm volatile("ldcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("ldcix %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("ldx %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return val;
 }
 
@@ -105,7 +130,11 @@ static inline uint64_t in_le64(const volatile uint64_t *addr)
 
 static inline void __out_8(volatile uint8_t *addr, uint8_t val)
 {
-	asm volatile("stbcix %0,0,%1"
+	if (vm_realmode())
+		asm volatile("stbcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stbx %0,0,%1"
 		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
@@ -117,7 +146,11 @@ static inline void out_8(volatile uint8_t *addr, uint8_t val)
 
 static inline void __out_be16(volatile uint16_t *addr, uint16_t val)
 {
-	asm volatile("sthcix %0,0,%1"
+	if (vm_realmode())
+		asm volatile("sthcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("sthx %0,0,%1"
 		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
@@ -134,7 +167,11 @@ static inline void out_le16(volatile uint16_t *addr, uint16_t val)
 
 static inline void __out_be32(volatile uint32_t *addr, uint32_t val)
 {
-	asm volatile("stwcix %0,0,%1"
+	if (vm_realmode())
+		asm volatile("stwcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stwx %0,0,%1"
 		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
@@ -151,7 +188,11 @@ static inline void out_le32(volatile uint32_t *addr, uint32_t val)
 
 static inline void __out_be64(volatile uint64_t *addr, uint64_t val)
 {
-	asm volatile("stdcix %0,0,%1"
+	if (vm_realmode())
+		asm volatile("stdcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stdx %0,0,%1"
 		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
diff --git a/include/mem_region.h b/include/mem_region.h
index d9e490af4..a18494d44 100644
--- a/include/mem_region.h
+++ b/include/mem_region.h
@@ -46,6 +46,7 @@ struct mem_region {
 	struct list_node list;
 	const char *name;
 	uint64_t start, len;
+	uint64_t vm_mapped_len;
 	struct dt_node *node;
 	enum mem_region_type type;
 	struct list_head free_list;
diff --git a/include/processor.h b/include/processor.h
index b759752b5..56d988189 100644
--- a/include/processor.h
+++ b/include/processor.h
@@ -52,7 +52,9 @@
 #define SPR_SRR1	0x01b	/* RW: Exception save/restore reg 1 */
 #define SPR_CFAR	0x01c	/* RW: Come From Address Register */
 #define SPR_AMR		0x01d	/* RW: Authority Mask Register */
+#define SPR_PID		0x030	/* RW: PID register */
 #define SPR_IAMR	0x03d	/* RW: Instruction Authority Mask Register */
+#define SPR_UAMOR	0x09d
 #define SPR_RPR		0x0ba   /* RW: Relative Priority Register */
 #define SPR_TBRL	0x10c	/* RO: Timebase low */
 #define SPR_TBRU	0x10d	/* RO: Timebase high */
@@ -74,10 +76,12 @@
 #define SPR_HSRR1	0x13b	/* RW: HV Exception save/restore reg 1 */
 #define SPR_TFMR	0x13d
 #define SPR_LPCR	0x13e
+#define SPR_LPID	0x13f	/* RW: LPID register */
 #define SPR_HMER	0x150	/* Hypervisor Maintenance Exception */
 #define SPR_HMEER	0x151	/* HMER interrupt enable mask */
 #define SPR_PCR		0x152
 #define SPR_AMOR	0x15d
+#define SPR_PTCR	0x1d0	/* RW: Partition table control register */
 #define SPR_PSSCR	0x357   /* RW: Stop status and control (ISA 3) */
 #define SPR_TSCR	0x399
 #define SPR_HID0	0x3f0
@@ -93,6 +97,11 @@
 #define SPR_SRR1_PM_WAKE_SRESET	0x100000
 #define SPR_SRR1_PM_WAKE_MCE	0x3c0000	/* Use reserved value for MCE */
 
+/* Bits in DSISR */
+
+#define	DSISR_ISSTORE		0x02000000
+
+
 /* Bits in LPCR */
 
 /* Powersave Exit Cause Enable is different on each generation */
@@ -322,9 +331,9 @@ static inline void isync(void)
 /*
  * Cache sync
  */
-static inline void sync_icache(void)
+static inline void sync_icache(unsigned long ptr)
 {
-	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (0) : "memory");
+	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (ptr) : "memory");
 }
 
 /*
diff --git a/include/skiboot.h b/include/skiboot.h
index 1b3bacbe7..98a69ef1d 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -56,8 +56,13 @@ extern char __sym_map_end[];
 extern char _romem_end[];
 
 #ifndef __TESTING__
+extern char _stext[], _etext[];
 /* Readonly section start and end. */
 extern char __rodata_start[], __rodata_end[];
+extern char _sdata[], _edata[];
+extern char __sym_map_start[], __sym_map_end[];
+extern char _sbss[], _ebss[];
+extern char _end[];
 
 static inline bool is_rodata(const void *p)
 {
@@ -191,6 +196,7 @@ extern void disable_fast_reboot(const char *reason);
 extern void add_fast_reboot_dt_entries(void);
 extern void fast_reboot(void);
 extern void __noreturn __secondary_cpu_entry(void);
+extern void __noreturn __return_cpu_entry(void);
 extern void __noreturn load_and_boot_kernel(bool is_reboot);
 extern void cleanup_local_tlb(void);
 extern void cleanup_global_tlb(void);
@@ -341,4 +347,25 @@ extern int fake_nvram_info(uint32_t *total_size);
 extern int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len);
 extern int fake_nvram_write(uint32_t offset, void *src, uint32_t size);
 
+/* core/vm.c */
+#define PAGE_SIZE 4096
+
+bool vm_realmode(void);
+void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci);
+void vm_unmap_global(unsigned long addr, unsigned long len);
+void *vm_map(unsigned long addr, unsigned long len, bool rw);
+void vm_unmap(unsigned long addr, unsigned long len);
+void vm_init(void);
+void vm_init_stacks(void);
+void vm_destroy(void);
+void vm_init_secondary(void);
+void vm_enter(void);
+void vm_exit(void);
+void vm_exit_cleanup(void);
+void vm_map_stacks(void);
+bool vm_dslb(uint64_t nia, uint64_t dar);
+bool vm_islb(uint64_t nia);
+bool vm_dsi(uint64_t nia, uint64_t dar, bool store);
+bool vm_isi(uint64_t nia);
+
 #endif /* __SKIBOOT_H */
diff --git a/libstb/container.c b/libstb/container.c
index a720fbbf1..aef169e1c 100644
--- a/libstb/container.c
+++ b/libstb/container.c
@@ -19,14 +19,20 @@
 
 bool stb_is_container(const void *buf, size_t size)
 {
+	uint32_t *t;
 	ROM_container_raw *c;
+	bool ret = true;
 
 	c = (ROM_container_raw*) buf;
 	if (!buf || size < SECURE_BOOT_HEADERS_SIZE)
 		return false;
-	if (be32_to_cpu(c->magic_number) != ROM_MAGIC_NUMBER )
-		return false;
-	return true;
+
+	t = vm_map((unsigned long)&c->magic_number, sizeof(*t), false);
+	if (be32_to_cpu(*t) != ROM_MAGIC_NUMBER)
+		ret = false;
+	vm_unmap((unsigned long)&c->magic_number, sizeof(*t));
+
+	return ret;
 }
 
 uint32_t stb_payload_magic(const void *buf, size_t size)
diff --git a/skiboot.lds.S b/skiboot.lds.S
index 4a7727dc9..f157ebfc2 100644
--- a/skiboot.lds.S
+++ b/skiboot.lds.S
@@ -95,18 +95,33 @@ SECTIONS
 		KEEP(*(.cpuctrl.data))
 	}
 
+	/* Relocations */
 	. = ALIGN(0x10);
+	.dynamic : {
+		__dynamic_start = .;
+		*(.dynamic)
+		__dynamic_end = .;
+	}
+
+	. = ALIGN(0x10);
+	.rela.dyn : {
+		__rela_dyn_start = .;
+		*(.rela*)
+		__rela_dyn_end = .;
+	}
+
+	. = ALIGN(0x1000);
 	_stext = .;
  	.text : {
 		*(.text*)
 		*(.sfpr .glink)
 	}
 	_etext = .;
+	. = ALIGN(0x1000);
 
+	__rodata_start = .;
 	.rodata : {
-		__rodata_start = .;
 		*(.rodata .rodata.*)
-		__rodata_end = .;
 	}
 
 	. = ALIGN(0x10);
@@ -130,38 +145,21 @@ SECTIONS
 		*(.toc)
 	}
 
-	. = ALIGN(0x10);
-	.opal_table : {
-		__opal_table_start = .;
-		KEEP(*(.opal_table))
-		__opal_table_end = .;
-	}
-
 	.platforms : {
 		__platforms_start = .;
 		KEEP(*(.platforms))
 		__platforms_end = .;
 	}
 
-	/* Do I need to keep these ? */
-	.dynsym : { *(.dynsym)	}
-	.dynstr : { *(.dynstr)	}
-
-	/* Relocations */
 	. = ALIGN(0x10);
-	.dynamic : {
-		__dynamic_start = .;
-		*(.dynamic)
-		__dynamic_end = .;
+	.opal_table : {
+		__opal_table_start = .;
+		KEEP(*(.opal_table))
+		__opal_table_end = .;
 	}
+	__rodata_end = .;
 
-	. = ALIGN(0x10);
-	.rela.dyn : {
-		__rela_dyn_start = .;
-		*(.rela*)
-		__rela_dyn_end = .;
-	}
-	.plt    : { *(.plt) *(.iplt) }
+	. = ALIGN(0x1000);
 
 	.hash          : { *(.hash)   }
 	.gnu.hash      : { *(.gnu.hash) }
@@ -171,7 +169,6 @@ SECTIONS
 	.gnu.version_d : { *(.gnu.version_d) }
 	.gnu.version_r : { *(.gnu.version_r) }
 
-	. = ALIGN(0x10);
 	.sym_map : {
 		__sym_map_start = . ;
 		KEEP(*(.sym_map))
@@ -184,6 +181,9 @@ SECTIONS
 	 */
 	_romem_end = .;
 
+	. = ALIGN(0x1000);
+
+	_sdata = .;
 	.data : {
 		/*
 		 * A couple of things that need to be 4K aligned and
@@ -200,6 +200,10 @@ SECTIONS
 		*(.toc1)
 		*(.branch_lt)
 	}
+	.plt    : { *(.plt) *(.iplt) }
+	_edata = .;
+
+	. = ALIGN(0x1000);
 
 	/* We locate the BSS at 4M to leave room for the symbol map */
 	. = 0x400000;
-- 
2.20.1



More information about the Skiboot mailing list