[Skiboot] [RFC PATCH] Virtual Memory for OPAL boot
Cédric Le Goater
clg at kaod.org
Wed Apr 29 19:49:53 AEST 2020
On 4/28/20 9:44 AM, Nicholas Piggin wrote:
> vm_map_global / vm_unmap_global sets up all-CPUs visible 1:1 mappings.
> vm_map / vm_unmap create a per-cpu mapping, which cannot be nested.
>
> A list of global extents + a local extent per cpu is kept to describe
> active mappings. Fault handlers look these up to install translations.
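>
> For example, a long-lived structure can be pinned with a global
> mapping while one-off accesses borrow the per-cpu window. A sketch
> against these interfaces (not code from the patch; tbl, blob and buf
> are placeholders):
>
>     /* long-lived, visible to all CPUs, read-write, not cache-inhibited */
>     vm_map_global("EXAMPLE", (unsigned long)tbl, tbl_len, true, false);
>
>     /* transient per-cpu mapping; must not be nested on this CPU */
>     t = vm_map((unsigned long)blob, blob_len, false);
>     memcpy(buf, t, blob_len);
>     vm_unmap((unsigned long)t, blob_len);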
>
> Booting with virtual memory is all well and good, and it can help find
> bugs. The bigger benefit is that a logical virtual map is created in
> the process, which can be given to the OS and used to create a virtual
> memory environment for the OPAL runtime to execute in.
The goal is to turn OPAL into a kernel driver and the OPAL calls into
simple function calls?
Thanks,
C.
>
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
> ---
> - Countless fixes and improvements since last posted. This boots mambo
> and a P9 witherspoon I have, and is the basis for later skiboot and
> kernel patches which do actually make OPAL calls in virtual mode using
> a specific mm context created for it.
>
> core/Makefile.inc | 2 +-
> core/cpu.c | 22 +-
> core/exceptions.c | 68 +++-
> core/fast-reboot.c | 14 +-
> core/init.c | 173 ++++++--
> core/mem_region.c | 145 +++++--
> core/opal.c | 38 +-
> core/platform.c | 15 +-
> core/vm.c | 942 +++++++++++++++++++++++++++++++++++++++++++
> hdata/spira.c | 35 +-
> hw/fake-nvram.c | 12 +-
> hw/homer.c | 15 +-
> hw/lpc-uart.c | 32 +-
> hw/lpc.c | 6 +
> hw/phb4.c | 9 +-
> hw/psi.c | 2 +
> hw/slw.c | 4 +-
> hw/xive.c | 5 +
> hw/xscom.c | 4 +
> include/cmpxchg.h | 3 +
> include/cpu.h | 22 +
> include/elf-abi.h | 21 +-
> include/io.h | 119 ++++--
> include/mem_region.h | 1 +
> include/platform.h | 4 +-
> include/processor.h | 13 +-
> include/skiboot.h | 27 ++
> libstb/container.c | 12 +-
> libstb/cvc.c | 3 +
> libstb/secureboot.c | 5 +-
> libstb/trustedboot.c | 6 +-
> skiboot.lds.S | 26 +-
> 32 files changed, 1650 insertions(+), 155 deletions(-)
> create mode 100644 core/vm.c
>
> diff --git a/core/Makefile.inc b/core/Makefile.inc
> index 829800e5b..7a4bb6797 100644
> --- a/core/Makefile.inc
> +++ b/core/Makefile.inc
> @@ -3,7 +3,7 @@
> # -*-Makefile-*-
>
> SUBDIRS += core
> -CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
> +CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o vm.o
> CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
> CORE_OBJS += opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
> CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
> diff --git a/core/cpu.c b/core/cpu.c
> index 37d9f41a8..30f9c6e70 100644
> --- a/core/cpu.c
> +++ b/core/cpu.c
> @@ -416,6 +416,10 @@ static unsigned int cpu_idle_p8(enum cpu_wake_cause wake_on)
> }
> isync();
>
> + /* P8 must enter nap with VM disabled */
> + if (cpu->vm_setup)
> + vm_exit();
> +
> /* Enter nap */
> vec = enter_p8_pm_state(false);
>
> @@ -476,11 +480,19 @@ static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
> /* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
> psscr = PPC_BIT(42) | PPC_BIT(43) |
> PPC_BITMASK(54, 55) | PPC_BIT(63);
> + /*
> + * stop with EC=1 wakes with vm off. P9 can stop with vm
> + * enabled, but it's simpler to disable now and so it wakes
> + * in the proper state.
> + */
> + if (cpu->vm_setup)
> + vm_exit();
> vec = enter_p9_pm_state(psscr);
> } else {
> /* stop with EC=0 (resumes) which does not require sreset. */
> /* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
> psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
> + /* Can run with VM enabled */
> enter_p9_pm_lite_state(psscr);
> }
>
> @@ -499,6 +511,7 @@ static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
> static void cpu_idle_pm(enum cpu_wake_cause wake_on)
> {
> unsigned int vec;
> + bool was_vm_setup = this_cpu()->vm_setup;
>
> switch(proc_gen) {
> case proc_gen_p8:
> @@ -523,12 +536,17 @@ static void cpu_idle_pm(enum cpu_wake_cause wake_on)
> default:
> break;
> }
> - mtmsrd(MSR_RI, 1);
>
> } else if (vec == 0x200) {
> exception_entry_pm_mce();
> enable_machine_check();
> + }
> +
> + if (vec != 0) {
> + /* 0x100 or 0x200 */
> mtmsrd(MSR_RI, 1);
> + if (was_vm_setup)
> + vm_enter();
> }
> }
>
> @@ -1361,7 +1379,7 @@ static int64_t opal_return_cpu(void)
> printf("OPAL in_opal_call=%u\n", this_cpu()->in_opal_call);
> }
>
> - __secondary_cpu_entry();
> + __return_cpu_entry();
>
> return OPAL_HARDWARE; /* Should not happen */
> }
> diff --git a/core/exceptions.c b/core/exceptions.c
> index 389548d16..35c14f8af 100644
> --- a/core/exceptions.c
> +++ b/core/exceptions.c
> @@ -33,7 +33,7 @@ static void dump_regs(struct stack_frame *stack)
>
> #define EXCEPTION_MAX_STR 320
>
> -static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bool *fatal)
> +static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bool *fatal, bool *vm_setup)
> {
> uint64_t mce_flags, mce_addr;
> const char *mce_err;
> @@ -44,12 +44,28 @@ static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bo
> decode_mce(stack->srr0, stack->srr1, stack->dsisr, stack->dar,
> &mce_flags, &mce_err, &mce_addr);
>
> - /* Try to recover. */
> - if (mce_flags & MCE_ERAT_ERROR) {
> - /* Real-mode still uses ERAT, flush transient bitflips */
> + /* Try to recover */
> + if ((mce_flags & (MCE_SLB_ERROR|MCE_TABLE_WALK)) &&
> + (msr & (MSR_IR|MSR_DR)) &&
> + !this_cpu()->vm_local_map_inuse) {
> + /* Try to turn off VM if non-linear map is not in use. */
> + *vm_setup = false;
> + stack->srr1 &= ~(MSR_IR|MSR_DR);
> + mce_fix = "Disabling virtual memory";
> +
> + } else if (mce_flags & MCE_ERAT_ERROR) {
> flush_erat();
> mce_fix = "ERAT flush";
>
> + } else if (mce_flags & MCE_TLB_ERROR) {
> + cleanup_global_tlb();
> + mce_fix = "global TLB flush";
> +
> } else {
> *fatal = true;
> }
> @@ -83,6 +99,8 @@ static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bo
>
> void exception_entry(struct stack_frame *stack)
> {
> + struct cpu_thread *c = this_cpu();
> + bool vm_setup = c->vm_setup;
> bool fatal = false;
> bool hv;
> uint64_t nip;
> @@ -90,6 +108,8 @@ void exception_entry(struct stack_frame *stack)
> char buf[EXCEPTION_MAX_STR];
> size_t l;
>
> + c->vm_setup = false;
> +
> switch (stack->type) {
> case 0x500:
> case 0x980:
> @@ -134,9 +154,44 @@ void exception_entry(struct stack_frame *stack)
> break;
>
> case 0x200:
> - handle_mce(stack, nip, msr, &fatal);
> + handle_mce(stack, nip, msr, &fatal, &vm_setup);
> goto no_symbol;
>
> + case 0x300:
> + if (vm_dsi(nip, stack->dar, stack->dsisr))
> + goto out;
> + fatal = true;
> + l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
> + "Fatal %s address "REG" at "REG" ",
> + (stack->dsisr & DSISR_ISSTORE) ? "store" : "load",
> + stack->dar, nip);
> + break;
> +
> + case 0x380:
> + if (vm_dslb(nip, stack->dar))
> + goto out;
> + fatal = true;
> + l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
> + "Fatal load/store address "REG" at "REG" ",
> + stack->dar, nip);
> + break;
> +
> + case 0x400:
> + if (vm_isi(nip))
> + goto out;
> + fatal = true;
> + l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
> + "Fatal ifetch at "REG" ", nip);
> + break;
> +
> + case 0x480:
> + if (vm_islb(nip))
> + goto out;
> + fatal = true;
> + l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
> + "Fatal ifetch at "REG" ", nip);
> + break;
> +
> case 0x700: {
> struct trap_table_entry *tte;
>
> @@ -185,11 +240,14 @@ no_symbol:
> for (;;) ;
> }
>
> +out:
> + assert(!fatal);
> if (hv) {
> /* Set up for SRR return */
> stack->srr0 = nip;
> stack->srr1 = msr;
> }
> + c->vm_setup = vm_setup;
> }
>
> void exception_entry_pm_sreset(void)
> diff --git a/core/fast-reboot.c b/core/fast-reboot.c
> index 03777543a..e7f3b5c67 100644
> --- a/core/fast-reboot.c
> +++ b/core/fast-reboot.c
> @@ -381,6 +381,9 @@ void __noreturn fast_reboot_entry(void)
> cpu_set_sreset_enable(true);
> cpu_set_ipi_enable(true);
>
> + /* Enter virtual memory mode */
> + vm_init(true);
> +
> prlog(PR_INFO, "RESET: Releasing secondaries...\n");
>
> /* Release everybody */
> @@ -401,6 +404,7 @@ void __noreturn fast_reboot_entry(void)
> fast_boot_release = false;
>
> if (!chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
> + void *t;
> /*
> * mem_region_clear_unused avoids these preload regions
> * so it can run along side image preloading. Clear these
> @@ -410,8 +414,14 @@ void __noreturn fast_reboot_entry(void)
> * Mambo may have embedded payload here, so don't clear
> * it at all.
> */
> - memset(KERNEL_LOAD_BASE, 0, KERNEL_LOAD_SIZE);
> - memset(INITRAMFS_LOAD_BASE, 0, INITRAMFS_LOAD_SIZE);
> +
> + t = vm_map((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true);
> + memset(t, 0, KERNEL_LOAD_SIZE);
> + vm_unmap((unsigned long)t, KERNEL_LOAD_SIZE);
> +
> + t = vm_map((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true);
> + memset(t, 0, INITRAMFS_LOAD_SIZE);
> + vm_unmap((unsigned long)t, INITRAMFS_LOAD_SIZE);
> }
>
> /* Start preloading kernel and ramdisk */
> diff --git a/core/init.c b/core/init.c
> index 2bb48845d..95c0339cf 100644
> --- a/core/init.c
> +++ b/core/init.c
> @@ -94,6 +94,7 @@ static bool try_load_elf64_le(struct elf_hdr *header)
> uint64_t load_base = (uint64_t)kh;
> struct elf64le_phdr *ph;
> unsigned int i;
> + bool ret = false;
>
> printf("INIT: 64-bit LE kernel discovered\n");
>
> @@ -105,6 +106,9 @@ static bool try_load_elf64_le(struct elf_hdr *header)
> * but it will not work for any ELF binary.
> */
> ph = (struct elf64le_phdr *)(load_base + le64_to_cpu(kh->e_phoff));
> + vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
> + le16_to_cpu(kh->e_phnum)*sizeof(struct elf64le_phdr),
> + false, false);
> for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) {
> if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
> continue;
> @@ -121,7 +125,7 @@ static bool try_load_elf64_le(struct elf_hdr *header)
>
> if (!kernel_entry) {
> prerror("INIT: Failed to find kernel entry !\n");
> - return false;
> + goto out_unmap;
> }
> kernel_entry += load_base;
> kernel_32bit = false;
> @@ -133,7 +137,12 @@ static bool try_load_elf64_le(struct elf_hdr *header)
> prlog(PR_DEBUG, "INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
> kernel_entry, kernel_size);
>
> - return true;
> + ret = true;
> +
> +out_unmap:
> + vm_unmap_global((unsigned long)ph, le16_to_cpu(kh->e_phnum)*sizeof(struct elf64le_phdr));
> +
> + return ret;
> }
>
> static bool try_load_elf64(struct elf_hdr *header)
> @@ -144,12 +153,17 @@ static bool try_load_elf64(struct elf_hdr *header)
> struct elf64be_phdr *ph;
> struct elf64be_shdr *sh;
> unsigned int i;
> + bool ret = false;
> +
> + vm_map_global("KERNEL ELF64 Header", (unsigned long)header,
> + sizeof(struct elf64be_hdr), false, false);
>
> /* Check it's a ppc64 LE ELF */
> if (khle->ei_ident == ELF_IDENT &&
> khle->ei_data == ELF_DATA_LSB &&
> le16_to_cpu(khle->e_machine) == ELF_MACH_PPC64) {
> - return try_load_elf64_le(header);
> + ret = try_load_elf64_le(header);
> + goto out_unmap1;
> }
>
> /* Check it's a ppc64 ELF */
> @@ -157,7 +171,7 @@ static bool try_load_elf64(struct elf_hdr *header)
> kh->ei_data != ELF_DATA_MSB ||
> be16_to_cpu(kh->e_machine) != ELF_MACH_PPC64) {
> prerror("INIT: Kernel doesn't look like an ppc64 ELF\n");
> - return false;
> + goto out_unmap1;
> }
>
> /* Look for a loadable program header that has our entry in it
> @@ -168,6 +182,8 @@ static bool try_load_elf64(struct elf_hdr *header)
> * but it will not work for any ELF binary.
> */
> ph = (struct elf64be_phdr *)(load_base + be64_to_cpu(kh->e_phoff));
> + vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
> + be16_to_cpu(kh->e_phnum)*sizeof(struct elf64be_phdr), false, false);
> for (i = 0; i < be16_to_cpu(kh->e_phnum); i++, ph++) {
> if (be32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
> continue;
> @@ -184,7 +200,7 @@ static bool try_load_elf64(struct elf_hdr *header)
>
> if (!kernel_entry) {
> prerror("INIT: Failed to find kernel entry !\n");
> - return false;
> + goto out_unmap2;
> }
>
> /* For the normal big-endian ELF ABI, the kernel entry points
> @@ -194,6 +210,8 @@ static bool try_load_elf64(struct elf_hdr *header)
> * to assuming it obeys the ABI.
> */
> sh = (struct elf64be_shdr *)(load_base + be64_to_cpu(kh->e_shoff));
> + vm_map_global("KERNEL ELF Section Headers", (unsigned long)sh,
> + be16_to_cpu(kh->e_shnum)*sizeof(struct elf64be_shdr), false, false);
> for (i = 0; i < be16_to_cpu(kh->e_shnum); i++, sh++) {
> if (be64_to_cpu(sh->sh_addr) <= be64_to_cpu(kh->e_entry) &&
> (be64_to_cpu(sh->sh_addr) + be64_to_cpu(sh->sh_size)) >
> @@ -218,7 +236,15 @@ static bool try_load_elf64(struct elf_hdr *header)
> printf("INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
> kernel_entry, kernel_size);
>
> - return true;
> + ret = true;
> +
> + vm_unmap_global((unsigned long)sh, be16_to_cpu(kh->e_shnum)*sizeof(struct elf64be_shdr));
> +out_unmap2:
> + vm_unmap_global((unsigned long)ph, be16_to_cpu(kh->e_phnum)*sizeof(struct elf64be_phdr));
> +out_unmap1:
> + vm_unmap_global((unsigned long)header, sizeof(struct elf64be_hdr));
> +
> + return ret;
> }
>
> static bool try_load_elf32_le(struct elf_hdr *header)
> @@ -334,6 +360,7 @@ bool start_preload_kernel(void)
> int loaded;
>
> /* Try to load an external kernel payload through the platform hooks */
> + vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true, false);
> kernel_size = KERNEL_LOAD_SIZE;
> loaded = start_preload_resource(RESOURCE_ID_KERNEL,
> RESOURCE_SUBID_NONE,
> @@ -342,9 +369,11 @@ bool start_preload_kernel(void)
> if (loaded != OPAL_SUCCESS) {
> printf("INIT: platform start load kernel failed\n");
> kernel_size = 0;
> + vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
> return false;
> }
>
> + vm_map_global("INITRAMFS", (unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true, false);
> initramfs_size = INITRAMFS_LOAD_SIZE;
> loaded = start_preload_resource(RESOURCE_ID_INITRAMFS,
> RESOURCE_SUBID_NONE,
> @@ -352,6 +381,7 @@ bool start_preload_kernel(void)
> if (loaded != OPAL_SUCCESS) {
> printf("INIT: platform start load initramfs failed\n");
> initramfs_size = 0;
> + vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
> return false;
> }
>
> @@ -361,13 +391,16 @@ bool start_preload_kernel(void)
> static bool load_kernel(void)
> {
> void *stb_container = NULL;
> - struct elf_hdr *kh;
> + struct elf_hdr *kh, *t;
> + uint32_t ei_ident;
> + uint8_t ei_class;
> int loaded;
>
> prlog(PR_NOTICE, "INIT: Waiting for kernel...\n");
>
> loaded = wait_for_resource_loaded(RESOURCE_ID_KERNEL,
> RESOURCE_SUBID_NONE);
> + vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
>
> if (loaded != OPAL_SUCCESS) {
> printf("INIT: platform wait for kernel load failed\n");
> @@ -383,8 +416,10 @@ static bool load_kernel(void)
> ((uint64_t)__builtin_kernel_start) -
> SKIBOOT_BASE + boot_offset;
> printf("Using built-in kernel\n");
> + vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, kernel_size, true, false);
> memmove(KERNEL_LOAD_BASE, (void*)builtin_base,
> kernel_size);
> + vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, kernel_size);
> }
> }
>
> @@ -400,7 +435,7 @@ static bool load_kernel(void)
> if (kernel_entry < EXCEPTION_VECTORS_END) {
> cpu_set_sreset_enable(false);
> memcpy_null(NULL, old_vectors, EXCEPTION_VECTORS_END);
> - sync_icache();
> + sync_icache(0);
> } else {
> /* Hack for STB in Mambo, assume at least 4kb in mem */
> if (!kernel_size)
> @@ -431,15 +466,20 @@ static bool load_kernel(void)
> "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
> kernel_size);
>
> - if (kh->ei_ident != ELF_IDENT) {
> + t = vm_map((unsigned long)kh, sizeof(*kh), false);
> + ei_ident = t->ei_ident;
> + ei_class = t->ei_class;
> + vm_unmap((unsigned long)t, sizeof(*kh));
> +
> + if (ei_ident != ELF_IDENT) {
> prerror("INIT: ELF header not found. Assuming raw binary.\n");
> return true;
> }
>
> - if (kh->ei_class == ELF_CLASS_64) {
> + if (ei_class == ELF_CLASS_64) {
> if (!try_load_elf64(kh))
> return false;
> - } else if (kh->ei_class == ELF_CLASS_32) {
> + } else if (ei_class == ELF_CLASS_32) {
> if (!try_load_elf32(kh))
> return false;
> } else {
> @@ -467,7 +507,7 @@ static void load_initramfs(void)
>
> loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
> RESOURCE_SUBID_NONE);
> -
> + vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
> if (loaded != OPAL_SUCCESS || !initramfs_size)
> return;
>
> @@ -539,6 +579,7 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
> const struct dt_property *memprop;
> const char *cmdline, *stdoutp;
> uint64_t mem_top;
> + uint32_t *t;
>
> memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
> if (memprop)
> @@ -613,11 +654,13 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
>
> fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
>
> + t = vm_map(kernel_entry, 4, false);
> /* Check there is something there before we branch to it */
> - if (*(uint32_t *)kernel_entry == 0) {
> + if (*t == 0) {
> prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n");
> assert(0);
> }
> + vm_unmap(kernel_entry, 4);
>
> if (platform.exit)
> platform.exit();
> @@ -629,7 +672,10 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
> printf("INIT: Starting kernel at 0x%llx, fdt at %p %u bytes\n",
> kernel_entry, fdt, fdt_totalsize(fdt));
>
> - /* Disable machine checks on all */
> + /* Go back to realmode and tear down our VM before booting kernel */
> + vm_destroy();
> +
> + /* Disable machine checks, RI on all */
> cpu_disable_ME_RI_all();
>
> patch_traps(false);
> @@ -835,37 +881,60 @@ static void setup_branch_null_catcher(void)
>
> void copy_sreset_vector(void)
> {
> + static char patch[0x100];
> uint32_t *src, *dst;
> + uint32_t *t;
> + uint32_t len = (void *)&reset_patch_end - (void *)&reset_patch_start;
>
> /* Copy the reset code over the entry point. */
> src = &reset_patch_start;
> + t = vm_map((unsigned long)src, len, false);
> + memcpy(patch, t, len);
> + vm_unmap((unsigned long)src, len);
> +
> dst = (uint32_t *)0x100;
> - while(src < &reset_patch_end)
> - *(dst++) = *(src++);
> - sync_icache();
> + t = vm_map((unsigned long)dst, len, true);
> + memcpy(t, patch, len);
> + sync_icache((unsigned long)t);
> + vm_unmap((unsigned long)dst, len);
> }
>
> void copy_sreset_vector_fast_reboot(void)
> {
> + static char patch[0x100];
> uint32_t *src, *dst;
> + uint32_t *t;
> + uint32_t len = (void *)&reset_fast_reboot_patch_end -
> + (void *)&reset_fast_reboot_patch_start;
>
> /* Copy the reset code over the entry point. */
> src = &reset_fast_reboot_patch_start;
> + t = vm_map((unsigned long)src, len, false);
> + memcpy(patch, t, len);
> + vm_unmap((unsigned long)src, len);
> +
> dst = (uint32_t *)0x100;
> - while(src < &reset_fast_reboot_patch_end)
> - *(dst++) = *(src++);
> - sync_icache();
> + t = vm_map((unsigned long)dst, len, true);
> + memcpy(t, patch, len);
> + sync_icache((unsigned long)t);
> + vm_unmap((unsigned long)dst, len);
> }
>
> void copy_exception_vectors(void)
> {
> + void *t;
> +
> + t = vm_map(0x0, EXCEPTION_VECTORS_END, true);
> +
> /* Copy from 0x100 to EXCEPTION_VECTORS_END, avoid below 0x100 as
> * this is the boot flag used by CPUs still potentially entering
> * skiboot.
> */
> - memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100),
> + memcpy(t + 0x100, (void *)(SKIBOOT_BASE + 0x100),
> EXCEPTION_VECTORS_END - 0x100);
> - sync_icache();
> +
> + sync_icache((unsigned long)t);
> + vm_unmap(0x0, EXCEPTION_VECTORS_END);
> }
>
> /*
> @@ -879,15 +948,16 @@ void patch_traps(bool enable)
> for (tte = __trap_table_start; tte < __trap_table_end; tte++) {
> uint32_t *insn;
>
> - insn = (uint32_t *)tte->address;
> + insn = vm_map(tte->address, sizeof(uint32_t), true);
> if (enable) {
> *insn = PPC_INST_TRAP;
> } else {
> *insn = PPC_INST_NOP;
> }
> + sync_icache((unsigned long)insn);
> + vm_unmap(tte->address, sizeof(uint32_t));
> }
>
> - sync_icache();
> }
>
> static void per_thread_sanity_checks(void)
> @@ -937,19 +1007,22 @@ void pci_nvram_init(void)
> static uint32_t mem_csum(void *_p, void *_e)
> {
> size_t len = _e - _p;
> - uint32_t *p = _p;
> + uint32_t *t;
> uint32_t v1 = 0, v2 = 0;
> uint32_t csum;
> unsigned int i;
>
> + t = vm_map((unsigned long)_p, len, false);
> +
> for (i = 0; i < len; i += 4) {
> - uint32_t v = *p++;
> + uint32_t v = *t++;
> v1 += v;
> v2 += v1;
> }
> -
> csum = v1 ^ v2;
>
> + vm_unmap((unsigned long)_p, len);
> +
> return csum;
> }
>
> @@ -963,6 +1036,8 @@ static void checksum_romem(void)
> if (chip_quirk(QUIRK_SLOW_SIM))
> return;
>
> + /* Called in real mode */
> +
> csum = mem_csum(_start, _head_end);
> romem_csum ^= csum;
>
> @@ -1054,7 +1129,7 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
> prlog(PR_DEBUG, "initial console log level: memory %d, driver %d\n",
> (debug_descriptor.console_log_levels >> 4),
> (debug_descriptor.console_log_levels & 0x0f));
> - prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology.\n");
> + prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology. Now with more indirection.\n");
>
> #ifdef SKIBOOT_GCOV
> skiboot_gcov_done();
> @@ -1066,6 +1141,9 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
> /* Now locks can be used */
> init_locks();
>
> + /* Enter virtual memory mode */
> + vm_init(false);
> +
> /* Create the OPAL call table early on, entries can be overridden
> * later on (FSP console code for example)
> */
> @@ -1091,7 +1169,20 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
> if (parse_hdat(false) < 0)
> abort();
> } else {
> + void *t;
> + uint32_t size;
> +
> + t = vm_map((unsigned long)fdt, sizeof(struct fdt_header), false);
> + size = fdt_totalsize(t);
> + vm_unmap((unsigned long)fdt, sizeof(struct fdt_header));
> +
> + /*
> + * Would be nice to make this a local map, but it seems
> + * to need to be expanded in place.
> + */
> + vm_map_global("fdt", (unsigned long)fdt, size, false, false);
> dt_expand(fdt);
> + vm_unmap_global((unsigned long)fdt, size);
> }
> dt_add_cpufeatures(dt_root);
>
> @@ -1142,6 +1233,8 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
> */
> init_cpu_max_pir();
>
> + vm_init_stacks();
> +
> /*
> * Now, we init our memory map from the device-tree, and immediately
> * reserve areas which we know might contain data coming from
> @@ -1393,6 +1486,30 @@ void __noreturn __secondary_cpu_entry(void)
> enable_machine_check();
> mtmsrd(MSR_RI, 1);
>
> + vm_init_secondary();
> +
> + /* Some XIVE setup */
> + xive_cpu_callin(cpu);
> +
> + /* Wait for work to do */
> + while(true) {
> + if (cpu_check_jobs(cpu))
> + cpu_process_jobs();
> + else
> + cpu_idle_job();
> + }
> +}
> +
> +void __noreturn __return_cpu_entry(void)
> +{
> + struct cpu_thread *cpu = this_cpu();
> +
> + /* Secondary CPU called in */
> + cpu_callin(cpu);
> +
> + enable_machine_check();
> + mtmsrd(MSR_RI, 1);
> +
> /* Some XIVE setup */
> xive_cpu_callin(cpu);
>
> diff --git a/core/mem_region.c b/core/mem_region.c
> index 36de2d094..69f24d630 100644
> --- a/core/mem_region.c
> +++ b/core/mem_region.c
> @@ -25,7 +25,7 @@
> #define POISON_MEM_REGION 0
> #endif
> #define POISON_MEM_REGION_WITH 0x99
> -#define POISON_MEM_REGION_LIMIT 1*1024*1024*1024
> +#define POISON_MEM_REGION_LIMIT (128*1024*1024 - PAGE_SIZE)
>
> /* Locking: The mem_region_lock protects the regions list from concurrent
> * updates. Additions to, or removals from, the region list must be done
> @@ -57,24 +57,27 @@ static struct mem_region skiboot_os_reserve = {
> .type = REGION_OS,
> };
>
> -struct mem_region skiboot_heap = {
> - .name = "ibm,firmware-heap",
> - .start = HEAP_BASE,
> - .len = HEAP_SIZE,
> - .type = REGION_SKIBOOT_HEAP,
> -};
> -
> static struct mem_region skiboot_code_and_text = {
> .name = "ibm,firmware-code",
> .start = SKIBOOT_BASE,
> .len = HEAP_BASE - SKIBOOT_BASE,
> + .vm_mapped_len = HEAP_BASE - SKIBOOT_BASE,
> .type = REGION_SKIBOOT_FIRMWARE,
> };
>
> +struct mem_region skiboot_heap = {
> + .name = "ibm,firmware-heap",
> + .start = HEAP_BASE,
> + .len = HEAP_SIZE,
> + .vm_mapped_len = HEAP_SIZE,
> + .type = REGION_SKIBOOT_HEAP,
> +};
> +
> static struct mem_region skiboot_after_heap = {
> .name = "ibm,firmware-data",
> .start = HEAP_BASE + HEAP_SIZE,
> .len = SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
> + .vm_mapped_len = SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
> .type = REGION_SKIBOOT_FIRMWARE,
> };
>
> @@ -141,17 +144,40 @@ static struct alloc_hdr *next_hdr(const struct mem_region *region,
> return next;
> }
>
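> +/*
> + * Mapped length (from region->start) needed to cover an allocation of
> + * 'size' bytes at hdr, padded with one more free_hdr when it fits so
> + * the allocator can always read the following header without faulting.
> + */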
> +static unsigned long vm_map_limit(const struct mem_region *region,
> + const struct alloc_hdr *hdr,
> + unsigned long size)
> +{
> + unsigned long end = region->start + region->len;
> + unsigned long limit;
> +
> + assert((unsigned long)hdr >= region->start);
> +
> + limit = (unsigned long)hdr + size;
> + assert(limit <= end);
> +
> + if (limit + sizeof(struct free_hdr) <= end)
> + limit += sizeof(struct free_hdr);
> +
> + return limit - region->start;
> +}
> +
> #if POISON_MEM_REGION == 1
> static void mem_poison(struct free_hdr *f)
> {
> - size_t poison_size = (void*)tailer(f) - (void*)(f+1);
> + unsigned long start = (unsigned long)(f + 1);
> + unsigned long *t = tailer(f);
> + size_t poison_size = (unsigned long)t - start;
> + void *mem;
>
> /* We only poison up to a limit, as otherwise boot is
> * kinda slow */
> if (poison_size > POISON_MEM_REGION_LIMIT)
> poison_size = POISON_MEM_REGION_LIMIT;
>
> - memset(f+1, POISON_MEM_REGION_WITH, poison_size);
> + mem = vm_map(start, poison_size, true);
> + memset(mem, POISON_MEM_REGION_WITH, poison_size);
> + vm_unmap(start, poison_size);
> }
> #endif
>
> @@ -159,14 +185,36 @@ static void mem_poison(struct free_hdr *f)
> static void init_allocatable_region(struct mem_region *region)
> {
> struct free_hdr *f = region_start(region);
> + unsigned long num_longs;
> + unsigned long *t;
> +
> assert(region->type == REGION_SKIBOOT_HEAP ||
> region->type == REGION_MEMORY);
> - f->hdr.num_longs = region->len / sizeof(long);
> +
> + num_longs = region->len / sizeof(long);
> +
> + assert(PAGE_SIZE >= sizeof(*f));
> + assert(region->len >= PAGE_SIZE*2);
> +
> + list_head_init(®ion->free_list);
> +
> + if (!region->vm_mapped_len) {
> + /* SKIBOOT_BASE-SIZE regions already come mapped */
> + vm_map_global(region->name, region->start, sizeof(struct free_hdr), true, false);
> + region->vm_mapped_len = sizeof(struct free_hdr);
> + } else {
> + assert(region == &skiboot_heap);
> + }
> +
> + f->hdr.num_longs = num_longs;
> f->hdr.free = true;
> f->hdr.prev_free = false;
> - *tailer(f) = f->hdr.num_longs;
> - list_head_init(®ion->free_list);
> list_add(®ion->free_list, &f->list);
> +
> + t = vm_map((unsigned long)tailer(f), sizeof(long), true);
> + *t = num_longs;
> + vm_unmap((unsigned long)tailer(f), sizeof(long));
> +
> #if POISON_MEM_REGION == 1
> mem_poison(f);
> #endif
> @@ -176,6 +224,9 @@ static void make_free(struct mem_region *region, struct free_hdr *f,
> const char *location, bool skip_poison)
> {
> struct alloc_hdr *next;
> + unsigned long *t;
> + unsigned long new_end;
> + unsigned long new_sz;
>
> #if POISON_MEM_REGION == 1
> if (!skip_poison)
> @@ -202,20 +253,33 @@ static void make_free(struct mem_region *region, struct free_hdr *f,
> list_add(®ion->free_list, &f->list);
> }
>
> - /* Fix up tailer. */
> - *tailer(f) = f->hdr.num_longs;
> -
> - /* If next is free, coalesce it */
> + /* If next is free coalesce it, else mark us as free. */
> next = next_hdr(region, &f->hdr);
> if (next) {
> - next->prev_free = true;
> if (next->free) {
> struct free_hdr *next_free = (void *)next;
> list_del_from(®ion->free_list, &next_free->list);
> - /* Maximum of one level of recursion */
> - make_free(region, next_free, location, true);
> + f->hdr.num_longs += next_free->hdr.num_longs;
> + } else {
> + assert(!next->prev_free);
> + next->prev_free = true;
> + goto no_unmap;
> }
> }
> +
> + /* Freed to the end, may have to trim mapping */
> + new_end = (unsigned long)f + sizeof(struct free_hdr);
> + new_sz = new_end - region->start;
> + if (region != &skiboot_heap && new_sz < region->vm_mapped_len) {
> + vm_unmap_global(new_end, region->vm_mapped_len - new_sz);
> + region->vm_mapped_len = new_sz;
> + }
> +
> +no_unmap:
> + /* Fix up tailer. */
> + t = vm_map((unsigned long)tailer(f), sizeof(long), true);
> + *t = f->hdr.num_longs;
> + vm_unmap((unsigned long)tailer(f), sizeof(long));
> }
>
> /* Can we fit this many longs with this alignment in this free block? */
> @@ -253,11 +317,12 @@ static void discard_excess(struct mem_region *region,
> post->hdr.num_longs = hdr->num_longs - alloc_longs;
> post->hdr.prev_free = false;
>
> + /* No coalescing required. */
> + make_free(region, post, location, skip_poison);
> +
> /* Trim our block. */
> hdr->num_longs = alloc_longs;
>
> - /* This coalesces as required. */
> - make_free(region, post, location, skip_poison);
> }
> }
>
> @@ -445,6 +510,18 @@ found:
> if (next) {
> assert(next->prev_free);
> next->prev_free = false;
> + } else {
> + unsigned long new_sz;
> +
> + /* Took from the end, may have to expand mapping */
> + new_sz = vm_map_limit(region, &f->hdr, (alloc_longs + offset) * sizeof(long));
> + if (new_sz > region->vm_mapped_len) {
> + assert(region != &skiboot_heap);
> + vm_map_global(region->name,
> + region->start + region->vm_mapped_len,
> + new_sz - region->vm_mapped_len, true, false);
> + region->vm_mapped_len = new_sz;
> + }
> }
>
> if (offset != 0) {
> @@ -536,6 +613,7 @@ bool mem_resize(struct mem_region *region, void *mem, size_t len,
> {
> struct alloc_hdr *hdr, *next;
> struct free_hdr *f;
> + unsigned long new_sz;
>
> /* This should be a constant. */
> assert(is_rodata(location));
> @@ -566,6 +644,15 @@ bool mem_resize(struct mem_region *region, void *mem, size_t len,
> if (!next || !next->free || hdr->num_longs + next->num_longs < len)
> return false;
>
> + new_sz = vm_map_limit(region, hdr, len * sizeof(long));
> + if (new_sz > region->vm_mapped_len) {
> + assert(region != &skiboot_heap);
> + vm_map_global(region->name,
> + region->start + region->vm_mapped_len,
> + new_sz - region->vm_mapped_len, true, false);
> + region->vm_mapped_len = new_sz;
> + }
> +
> /* OK, it's free and big enough, absorb it. */
> f = (struct free_hdr *)next;
> list_del_from(®ion->free_list, &f->list);
> @@ -691,6 +778,7 @@ static struct mem_region *new_region(const char *name,
> region->name = name;
> region->start = start;
> region->len = len;
> + region->vm_mapped_len = 0;
> region->node = node;
> region->type = type;
> region->free_list.n.next = NULL;
> @@ -1199,6 +1287,7 @@ void mem_region_release_unused(void)
> continue;
>
> used_len = allocated_length(r);
> + assert(used_len <= r->vm_mapped_len);
>
> prlog(PR_INFO, " %s: %llu/%llu used\n",
> r->name, (long long)used_len, (long long)r->len);
> @@ -1227,6 +1316,10 @@ void mem_region_release_unused(void)
> }
> list_add(®ions, &for_linux->list);
> }
> + if (r->vm_mapped_len > used_len) {
> + vm_unmap_global(r->start + used_len, r->vm_mapped_len - used_len);
> + r->vm_mapped_len = used_len;
> + }
> }
> unlock(&mem_region_lock);
> }
> @@ -1271,9 +1364,13 @@ static void mem_clear_range(uint64_t s, uint64_t e)
> return;
> }
>
> - prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
> - (long long)s, (long long)e);
> + /*
> + * Large clear thrashes the small hash table, with parallel clearing
> + * this can livelock. Clear in real mode.
> + */
> + vm_exit();
> memset((void *)s, 0, e - s);
> + vm_enter();
> }
>
> struct mem_region_clear_job_args {
> diff --git a/core/opal.c b/core/opal.c
> index 46518c445..9ab7391d1 100644
> --- a/core/opal.c
> +++ b/core/opal.c
> @@ -44,19 +44,39 @@ static uint64_t opal_dynamic_events;
> extern uint32_t attn_trigger;
> extern uint32_t hir_trigger;
>
> +void __opal_register(uint64_t token, void *func, unsigned int nargs)
> +{
> + uint64_t f;
> + uint64_t *t;
> + u8 *a;
> +
> + assert(token <= OPAL_LAST);
> +
> + f = function_entry_address(func);
> +
> + t = vm_map((unsigned long)&opal_branch_table[token], sizeof(*t), true);
> + *t = f;
> + vm_unmap((unsigned long)&opal_branch_table[token], sizeof(*t));
> +
> + a = vm_map((unsigned long)&opal_num_args[token], sizeof(*a), true);
> + *a = nargs;
> + vm_unmap((unsigned long)&opal_num_args[token], sizeof(*a));
> +}
>
> void opal_table_init(void)
> {
> struct opal_table_entry *s = __opal_table_start;
> struct opal_table_entry *e = __opal_table_end;
> + struct opal_table_entry *te;
> + size_t size = (void *)e - (void *)s;
>
> prlog(PR_DEBUG, "OPAL table: %p .. %p, branch table: %p\n",
> s, e, opal_branch_table);
> - while(s < e) {
> - ((uint64_t *)opal_branch_table)[s->token] = function_entry_address(s->func);
> - ((u8 *)opal_num_args)[s->token] = s->nargs;
> - s++;
> - }
> +
> + vm_map_global("OPAL table", (unsigned long)s, size, false, false);
> + for (te = s; te < e; te++)
> + __opal_register(te->token, te->func, te->nargs);
> + vm_unmap_global((unsigned long)s, size);
> }
>
> /* Called from head.S, thus no prototype */
> @@ -317,14 +337,6 @@ int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target)
> }
> opal_call(OPAL_QUIESCE, opal_quiesce, 2);
>
> -void __opal_register(uint64_t token, void *func, unsigned int nargs)
> -{
> - assert(token <= OPAL_LAST);
> -
> - ((uint64_t *)opal_branch_table)[token] = function_entry_address(func);
> - ((u8 *)opal_num_args)[token] = nargs;
> -}
> -
> /*
> * add_opal_firmware_exports_node: adds properties to the device-tree which
> * the OS will then change into sysfs nodes.
> diff --git a/core/platform.c b/core/platform.c
> index 8f4a3b877..839cf97ee 100644
> --- a/core/platform.c
> +++ b/core/platform.c
> @@ -242,8 +242,10 @@ void set_bmc_platform(const struct bmc_platform *bmc)
>
> void probe_platform(void)
> {
> - struct platform *platforms = &__platforms_start;
> - unsigned int i;
> + struct platform *s = __platforms_start;
> + struct platform *e = __platforms_end;
> + struct platform *p;
> + size_t size = (void *)e - (void *)s;
>
> /* Detect Manufacturing mode */
> if (dt_find_property(dt_root, "ibm,manufacturing-mode")) {
> @@ -257,12 +259,15 @@ void probe_platform(void)
> manufacturing_mode = true;
> }
>
> - for (i = 0; &platforms[i] < &__platforms_end; i++) {
> - if (platforms[i].probe && platforms[i].probe()) {
> - platform = platforms[i];
> + vm_map_global("Platform table", (unsigned long)s, size, false, false);
> + for (p = s; p < e; p++) {
> + if (p->probe && p->probe()) {
> + platform = *p;
> break;
> }
> }
> + vm_unmap_global((unsigned long)s, size);
> +
> if (!platform.name) {
> platform = generic_platform;
> if (platform.probe)
> diff --git a/core/vm.c b/core/vm.c
> new file mode 100644
> index 000000000..84534796c
> --- /dev/null
> +++ b/core/vm.c
> @@ -0,0 +1,942 @@
> +/* Copyright 2018 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <ccan/container_of/container_of.h>
> +#include <ccan/list/list.h>
> +#include <ccan/str/str.h>
> +#include <cmpxchg.h>
> +#include <cpu.h>
> +#include <opal.h>
> +#include <skiboot.h>
> +#include <stack.h>
> +#include <timebase.h>
> +#include <trace.h>
> +
> +static bool vm_setup = false;
> +static bool vm_globals_allocated = false;
> +
> +#define SLB_SZ (256UL*1024*1024)
> +#define SLB_NR 32
> +#define LOCAL_SLB_NR 2
> +#define GLOBAL_SLB_NR (SLB_NR - LOCAL_SLB_NR)
> +#define LOCAL_SLB_BASE GLOBAL_SLB_NR
> +
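> +/*
> + * Per-cpu transient mappings: each CPU owns one 256MB window at
> + * LOCAL_EA_BEGIN + pir * LOCAL_EA_PERCPU, bolted with its own SLB
> + * entry (index LOCAL_SLB_BASE) and serviced by vm_map()/vm_unmap().
> + */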
> +#define LOCAL_EA_PERCPU (SLB_SZ)
> +#define LOCAL_EA_BEGIN 0x0008000000000000ULL
> +#define LOCAL_EA_END 0x0009000000000000ULL
> +
> +static void __nomcount slb_install(unsigned long esid, unsigned long vsid, unsigned int index)
> +{
> + unsigned long rs;
> + unsigned long rb;
> +
> + rs = vsid << (63-51); /* 256MB VSID */
> + rs |= 1UL << (63-53); /* Kp = 1 */
> + if (PAGE_SIZE == 0x10000) {
> + rs |= 1UL << (63-55); /* L = 1 */
> + rs |= 1UL << (63-59); /* LP = 01 */
> + }
> +
> + rb = esid << (63-35); /* 256MB ESID */
> + rb |= 1UL << (63-36); /* V = 1 */
> + rb |= index;
> +
> + asm volatile("slbmte %0,%1" : : "r"(rs), "r"(rb) : "memory");
> +}
> +
> +#if 0
> +static void slb_remove(unsigned long esid)
> +{
> + asm volatile("isync ; slbie %0 ; isync" : : "r"(esid << 28) : "memory");
> +}
> +#endif
> +
> +static void slb_remove_all(void)
> +{
> + asm volatile("isync ; slbmte %0,%0 ; slbia ; isync" : : "r"(0) : "memory");
> +}
> +
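> +/*
> + * Install a translation for a faulted EA, replacing the global SLB
> + * slots round-robin. The per-cpu window keeps its bolted entry at
> + * LOCAL_SLB_BASE (installed by vm_init_cpu()), outside this rotation.
> + */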
> +static void __nomcount slb_add(unsigned long ea)
> +{
> + struct cpu_thread *cpu = this_cpu();
> + uint64_t esid = ea >> 28;
> + uint64_t vsid = ea >> 28;
> +
> + slb_install(esid, vsid, cpu->vm_slb_rr);
> +
> + cpu->vm_slb_rr++;
> + if (cpu->vm_slb_rr == GLOBAL_SLB_NR)
> + cpu->vm_slb_rr = 0;
> +}
> +
> +struct hpte {
> + beint64_t dword[2];
> +};
> +
> +struct hpteg {
> + struct hpte hpte[8];
> +};
> +
> +static struct hpteg *htab;
> +static unsigned long htab_shift;
> +static unsigned long htab_pteg_mask;
> +
> +static struct lock htab_lock;
> +
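> +/*
> + * Insert an HPTE mapping va->pa. Only the primary hash group is used;
> + * if all eight slots are valid, a pseudo-random victim derived from
> + * the timebase is overwritten.
> + */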
> +static void __nomcount htab_install(unsigned long va, unsigned long pa, int rw, int ex, int ci, bool local)
> +{
> + unsigned long hash;
> + struct hpteg *hpteg;
> + struct hpte *hpte;
> + unsigned long ava = va >> 23;
> + unsigned long arpn = pa >> 12;
> + unsigned long dw0, dw1;
> + unsigned long _dw0;
> + unsigned long _ava;
> + unsigned int hstart, hend;
> + unsigned int i;
> +
> + if (PAGE_SIZE == 0x10000)
> + arpn >>= 4;
> +
> + dw0 = ava << (63-56); /* AVA = ava */
> + dw0 |= 0x1; /* V = 1 */
> + if (PAGE_SIZE == 0x10000)
> + dw0 |= 0x4; /* L = 1 */
> + if (local)
> + dw0 |= 0x8; /* SW[0] = 1 */
> +
> + if (PAGE_SIZE == 0x10000) {
> + dw1 = (arpn << (63-43 - 4)); /* ARPN||LP-4 = arpn */
> + dw1 |= (0x1 << (63-43 - 8)); /* LP = 0001 */
> + } else
> + dw1 = (arpn << (63-43 - 8)); /* ARPN||LP = arpn */
> + if (!rw)
> + dw1 |= (1UL << (63 - 0)) | (1UL << (63 - 63 + 1)); /* pp = 110 */
> + if (!ex)
> + dw1 |= (1UL << (63 - 61)); /* N = 1 */
> + dw1 |= (1UL << (63 - 60 + 1)); /* WIMG = 0010 */
> + if (ci)
> + dw1 |= (1UL << (63 - 60)) | (1UL << (63 - 60 + 2)); /* WIMG = 0111 */
> + dw1 |= (1UL << (63 - 55)) | (1UL << (63 - 56)); /* R=C=1 */
> +
> + if (PAGE_SIZE == 0x10000)
> + hash = ((va >> 16) & 0xfff) ^ ((va >> 28) & 0x7fffffffffUL);
> + else
> + hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
> + hpteg = &htab[hash & htab_pteg_mask];
> +
> + lock(&htab_lock);
> +
> + hstart = 0;
> + hend = 7;
> +
> + for (i = hstart; i <= hend; i++) {
> + hpte = &hpteg->hpte[i];
> +
> + _dw0 = be64_to_cpu(hpte->dword[0]);
> + if (_dw0 & 1) {
> + _ava = _dw0 >> (63 - 56);
> + if (_ava == ava) {
> + assert(!local);
> + /* This could happen with racing global fault */
> + assert(dw0 == _dw0);
> + assert(dw1 == be64_to_cpu(hpte->dword[1]));
> + goto out;
> + }
> +
> + continue;
> + }
> +
> + assert(!_dw0);
> + goto install;
> + }
> +
> + i = mftb();
> + i = (i ^ (i >> 4)) & 0x7;
> + hpte = &hpteg->hpte[i];
> +
> +install:
> + hpte->dword[1] = cpu_to_be64(dw1);
> + eieio();
> + hpte->dword[0] = cpu_to_be64(dw0);
> + asm volatile("ptesync" ::: "memory");
> +out:
> + unlock(&htab_lock);
> +}
> +
> +static void htab_remove(unsigned long va, int local)
> +{
> + struct cpu_thread *c = this_cpu();
> + bool vm_setup = c->vm_setup;
> + unsigned long hash;
> + struct hpteg *hpteg;
> + unsigned long ava = va >> 23;
> + unsigned long dw0;
> + unsigned long rb;
> + unsigned int hstart, hend;
> + unsigned int i;
> +
> + dw0 = ava << (63-56);
> + dw0 |= 0x1;
> + if (PAGE_SIZE == 0x10000)
> + dw0 |= 0x4;
> + if (local)
> + dw0 |= 0x8;
> +
> + if (PAGE_SIZE == 0x10000)
> + hash = ((va >> 16) & 0xfff) ^ ((va >> 28) & 0x7fffffffffUL);
> + else
> + hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
> + hpteg = &htab[hash & htab_pteg_mask];
> +
> + if (vm_setup)
> + vm_exit();
> + lock(&htab_lock);
> + hstart = 0;
> + hend = 7;
> +
> + for (i = hstart; i <= hend; i++) {
> + struct hpte *hpte = &hpteg->hpte[i];
> + beint64_t _raw_dw0;
> + uint64_t _dw0;
> +
> + _raw_dw0 = hpte->dword[0];
> + _dw0 = be64_to_cpu(_raw_dw0);
> +
> + if (!(_dw0 & 1)) {
> + assert(!_raw_dw0);
> + continue;
> + }
> +
> + if (_dw0 != dw0)
> + continue;
> +
> + hpte->dword[0] = 0;
> + eieio();
> + hpte->dword[1] = 0;
> +
> + break;
> + }
> +
> + if (PAGE_SIZE == 0x10000) {
> + rb = (va >> 16) << (63 - 47); /* AVA||LP-4 */
> + rb |= 0x1 << (63 - 51); /* LP=0001 */
> + rb |= 0x1; /* L=1 */
> + } else {
> + rb = va & ~0xfffUL;
> + }
> +
> + unlock(&htab_lock);
> +
> + if (vm_setup)
> + vm_enter();
> +
> + if (local) {
> + asm volatile("ptesync" ::: "memory");
> + asm volatile("tlbiel %0" : : "r"(rb));
> + asm volatile("ptesync" ::: "memory");
> + } else {
> + asm volatile("ptesync" ::: "memory");
> + asm volatile("tlbie %0,%1" : : "r"(rb), "r"(0));
> + asm volatile("eieio ; tlbsync ; ptesync" ::: "memory");
> +
> + }
> +}
> +
> +/*
> + * Try to fix problems in callers if !strict.
> + */
> +static bool vm_strict = false;
> +
> +static struct list_head vm_maps = LIST_HEAD_INIT(vm_maps);
> +static struct lock vm_maps_lock;
> +static unsigned long nr_vm_maps;
> +
> +static void __vm_map(const char *name, unsigned long addr, unsigned long len, unsigned long pa, bool r, bool w, bool x, bool ci, bool local)
> +{
> + struct cpu_thread *c = this_cpu();
> + bool vm_setup = c->vm_setup;
> + struct vm_map *new;
> + struct vm_map *vmm;
> +
> + if (local) {
> + new = &c->vm_local_map;
> + new->name = name;
> + new->address = addr;
> + new->length = len;
> + new->pa = pa;
> + new->readable = r;
> + new->writeable = w;
> + new->executable = x;
> + new->ci = ci;
> +
> + return;
> + }
> +
> + new = zalloc(sizeof(*new));
> + assert(new);
> +
> + new->name = name;
> + new->address = addr;
> + new->length = len;
> + new->pa = pa;
> + new->readable = r;
> + new->writeable = w;
> + new->executable = x;
> + new->ci = ci;
> +
> + /* Can not take a d-side fault while holding this lock */
> + if (vm_setup)
> + vm_exit();
> + lock(&vm_maps_lock);
> +
> + list_for_each(&vm_maps, vmm, list) {
> + unsigned long ps = addr & ~(PAGE_SIZE - 1);
> + unsigned long pe = (addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
> + unsigned long vmm_ps = vmm->address & ~(PAGE_SIZE - 1);
> + unsigned long vmm_pe = (vmm->address + vmm->length + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
> + bool mergeable = false;
> + bool samepage = false;
> +
> + /* Ensure no overlap */
> + assert(addr + len <= vmm->address || addr >= vmm->address + vmm->length);
> +
> + if (ps > vmm_pe)
> + continue; /* Sort */
> + if (pe < vmm_ps) {
> + /* Not same or adjacent page is easy */
> + list_add_before(&vm_maps, &new->list, &vmm->list);
> + goto found;
> + }
> + if (pe > vmm_ps || ps < vmm_pe)
> + samepage = true;
> +
> + mergeable = /* XXX: check pa */ 1 &&
> + (vmm->ci == ci) &&
> + (vmm->readable == r) &&
> + (vmm->writeable == w) &&
> + (vmm->executable == x);
> + samepage = false;
> +
> + if (samepage && !mergeable) {
> + printf("VMM: %s (%lx-%lx) mismatched permissions with same page mapping %s (%llx-%llx)\n", name, addr, addr + len, vmm->name, vmm->address, vmm->address + vmm->length);
> + assert(vmm->pa == pa);
> + assert(vmm->ci == ci);
> + assert(vmm->readable == r);
> + assert(vmm->writeable == w);
> + assert(vmm->executable == x);
> + }
> +
> + if (!strcmp(name, vmm->name) && mergeable) {
> + if (addr == vmm->address + vmm->length) {
> + free(new);
> + vmm->length += len;
> + goto done;
> + }
> +
> + if (addr + len == vmm->address) {
> + free(new);
> + vmm->address = addr;
> + vmm->pa = pa;
> + vmm->length += len;
> + goto done;
> + }
> + }
> +
> + if (addr >= vmm->address + vmm->length)
> + continue;
> + if (addr + len <= vmm->address) {
> + list_add_before(&vm_maps, &new->list, &vmm->list);
> + goto found;
> + }
> +
> + assert(0);
> + }
> + list_add_tail(&vm_maps, &new->list);
> +found:
> + nr_vm_maps++;
> +done:
> + unlock(&vm_maps_lock);
> + if (vm_setup)
> + vm_enter();
> +}
> +
> +static void __vm_unmap(unsigned long addr, unsigned long len, bool local)
> +{
> + struct cpu_thread *c = this_cpu();
> + bool vm_setup = c->vm_setup;
> + unsigned long end = addr + len;
> + struct vm_map *vmm, *to_free = NULL;
> +
> + if (local) {
> + vmm = &c->vm_local_map;
> + assert(addr == vmm->address);
> + assert(len == vmm->length);
> + memset(vmm, 0, sizeof(struct vm_map));
> +
> + if (vm_setup) {
> + while (addr < end) {
> + htab_remove(addr, local);
> + addr += PAGE_SIZE;
> + }
> + }
> +
> + return;
> + }
> +
> + /* Can not take a d-side fault while holding this lock */
> + if (vm_setup)
> + vm_exit();
> + lock(&vm_maps_lock);
> + list_for_each(&vm_maps, vmm, list) {
> + struct vm_map *new;
> +
> + if (addr + len <= vmm->address)
> + continue;
> + if (addr >= vmm->address + vmm->length)
> + continue;
> + if (addr == vmm->address && len == vmm->length) {
> + to_free = vmm;
> + goto found;
> + }
> +
> + if (addr == vmm->address) {
> + vmm->address += len;
> + vmm->pa += len;
> + vmm->length -= len;
> + goto done;
> + }
> +
> + if (addr + len == vmm->address + vmm->length) {
> + vmm->length -= len;
> + goto done;
> + }
> +
> + /* Unmaps will never span multiple because they always apply to a previous map, so this is a split */
> + new = zalloc(sizeof(*new));
> + assert(new);
> + memcpy(new, vmm, sizeof(*new));
> + list_add_before(&vm_maps, &new->list, &vmm->list);
> + nr_vm_maps++;
> +
> + new->length = addr - new->address;
> + vmm->address += new->length + len;
> + vmm->pa += new->length + len;
> + vmm->length -= new->length + len;
> + goto done;
> + }
> + vmm = NULL;
> + unlock(&vm_maps_lock);
> + if (!vm_strict) {
> + prerror("unmap didn't find anything\n");
> + backtrace();
> + goto out;
> + }
> + assert(0);
> +
> +found:
> + list_del(&vmm->list);
> + nr_vm_maps--;
> +done:
> + if (vm_setup) {
> + while (addr < end) {
> + htab_remove(addr, local);
> + addr += PAGE_SIZE;
> + }
> + }
> +
> + unlock(&vm_maps_lock);
> +out:
> + if (vm_setup)
> + vm_enter();
> +
> + if (to_free)
> + free(to_free);
> +}
> +
> +
> +void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci)
> +{
> + assert(this_cpu()->state != cpu_state_os);
> + __vm_map(name, addr, len, addr, true, rw, false, ci, false);
> +}
> +
> +void vm_map_global_text(const char *name, unsigned long addr, unsigned long len)
> +{
> + assert(this_cpu()->state != cpu_state_os);
> + __vm_map(name, addr, len, addr, true, false, true, false, false);
> +}
> +
> +void vm_unmap_global(unsigned long addr, unsigned long len)
> +{
> + assert(this_cpu()->state != cpu_state_os);
> + __vm_unmap(addr, len, false);
> +}
> +
> +
> +void *vm_map(unsigned long addr, unsigned long len, bool rw)
> +{
> + struct cpu_thread *c = this_cpu();
> + unsigned long newaddr;
> + unsigned long end;
> + unsigned long offset = addr & (PAGE_SIZE - 1);
> +
> + end = (addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
> + addr &= ~(PAGE_SIZE - 1);
> + len = end - addr;
> +
> + assert(len <= LOCAL_EA_PERCPU);
> +
> + /* Can't do nested mappings */
> + assert(!c->vm_local_map_inuse);
> + c->vm_local_map_inuse = true;
> +
> + if (c->vm_setup) {
> + newaddr = LOCAL_EA_BEGIN + LOCAL_EA_PERCPU * c->pir;
> + __vm_map("local", newaddr, len, addr, true, rw, false, false, true);
> + } else {
> + newaddr = addr;
> + }
> +
> + return (void *)newaddr + offset;
> +}
> +
> +void vm_unmap(unsigned long addr, unsigned long len)
> +{
> + struct cpu_thread *c = this_cpu();
> + unsigned long newaddr;
> + unsigned long end;
> +
> + end = (addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
> + addr &= ~(PAGE_SIZE - 1);
> + len = end - addr;
> +
> + assert(len <= LOCAL_EA_PERCPU);
> +
> + assert(c->vm_local_map_inuse);
> + c->vm_local_map_inuse = false;
> +
> + if (c->vm_setup) {
> + newaddr = LOCAL_EA_BEGIN + LOCAL_EA_PERCPU * c->pir;
> + __vm_unmap(newaddr, len, true);
> + }
> +}
> +
> +struct prte {
> + beint64_t dword[2];
> +};
> +
> +static struct prte *prtab;
> +static unsigned long old_lpcr;
> +static unsigned long new_lpcr;
> +
> +static void vm_init_cpu(void)
> +{
> + struct cpu_thread *c = this_cpu();
> + unsigned long ea = LOCAL_EA_BEGIN + LOCAL_EA_PERCPU * c->pir;
> + unsigned long esid = ea >> 28;
> + unsigned long vsid = ea >> 28;
> +
> + mtspr(SPR_LPCR, new_lpcr);
> +
> + mtspr(SPR_LPID, 0);
> + mtspr(SPR_PID, 0);
> + mtspr(SPR_HRMOR, 0);
> + mtspr(SPR_PTCR, (unsigned long)prtab);
> + mtspr(SPR_AMR, 0);
> + mtspr(SPR_IAMR, 0);
> + mtspr(SPR_AMOR, 0);
> + mtspr(SPR_UAMOR, 0);
> +
> + slb_remove_all();
> + slb_install(esid, vsid, LOCAL_SLB_BASE);
> +}
> +
> +void vm_init_secondary(void)
> +{
> + vm_init_cpu();
> + vm_enter();
> +}
> +
> +bool vm_realmode(void)
> +{
> + struct cpu_thread *c = this_cpu();
> +
> + return !vm_setup || !c->vm_setup;
> +}
> +
> +void vm_enter(void)
> +{
> + struct cpu_thread *c = this_cpu();
> +
> + assert(vm_setup);
> + if (c->vm_setup) {
> + prerror("CPU:%d vm_enter already entered\n", c->pir);
> + backtrace();
> + }
> + if (c->vm_local_map_inuse) {
> + prerror("CPU:%d vm_enter local map inuse\n", c->pir);
> + backtrace();
> + }
> +
> + c->vm_setup = true;
> + mtmsr(mfmsr() | (MSR_IR|MSR_DR));
> +}
> +
> +void vm_exit(void)
> +{
> + struct cpu_thread *c = this_cpu();
> +
> + assert(vm_setup);
> + if (!c->vm_setup) {
> + prerror("CPU:%d vm_exit already exited\n", c->pir);
> + backtrace();
> + }
> + if (c->vm_local_map_inuse) {
> + prerror("CPU:%d vm_enter local map inuse\n", c->pir);
> + backtrace();
> + }
> + c->vm_setup = false;
> + mtmsr(mfmsr() & ~(MSR_IR|MSR_DR));
> +}
> +
> +bool __nomcount vm_dslb(uint64_t nia, uint64_t dar)
> +{
> + /*
> + * Per-cpu map ranges are bolted to per-cpu SLBs.
> + */
> + assert((dar < LOCAL_EA_BEGIN) ||
> + (dar >= LOCAL_EA_END));
> +
> + (void)nia;
> + slb_add(dar);
> +
> + return true;
> +}
> +
> +bool __nomcount vm_islb(uint64_t nia)
> +{
> + slb_add(nia);
> +
> + return true;
> +}
> +
> +bool __nomcount vm_dsi(uint64_t nia, uint64_t dar, uint32_t dsisr)
> +{
> + struct cpu_thread *c = this_cpu();
> + struct vm_map *vmm;
> + uint64_t pa;
> + bool store = !!(dsisr & DSISR_ISSTORE);
> + bool ret = true;
> + bool local;
> +
> + if (dsisr & 0xbdffffffU) {
> + printf("Page fault bad dsisr at 0x%016llx dar=0x%016llx dsisr=0x%08x\n", nia, dar, dsisr);
> + return false;
> + }
> +
> + if ((dar >= LOCAL_EA_BEGIN) && (dar < LOCAL_EA_END)) {
> + local = true;
> + vmm = &c->vm_local_map;
> + if (dar >= vmm->address && dar < vmm->address + vmm->length)
> + goto found;
> + goto not_found;
> + }
> +
> + local = false;
> +
> + lock(&vm_maps_lock);
> + list_for_each(&vm_maps, vmm, list) {
> + assert(vmm->pa == vmm->address);
> + if (dar >= vmm->address && dar < vmm->address + vmm->length)
> + goto found;
> + }
> + if (!vm_strict) {
> + if (dar >= 0x0006000000000000 && dar < 0x0007000000000000)
> + /* MMIO */
> + htab_install(dar, dar, 1, 0, 1, false);
> + else if (dar < LOCAL_EA_BEGIN)
> + htab_install(dar, dar, 1, 0, 0, false);
> + else
> + ret = false;
> + unlock(&vm_maps_lock);
> + prerror("Page fault with no VMM at NIA:0x%016llx DAR:0x%016llx, store:%d\n", nia, dar, store);
> + backtrace();
> + list_for_each(&vm_maps, vmm, list)
> + prlog(PR_DEBUG, "%28s 0x%08llx-0x%08llx\n", vmm->name,
> + vmm->address, vmm->address + vmm->length);
> + goto out;
> + }
> + unlock(&vm_maps_lock);
> +not_found:
> + prerror(" vmm not found\n");
> + ret = false;
> + assert(0);
> + goto out;
> +
> +found:
> + pa = vmm->pa + (dar & ~(PAGE_SIZE - 1)) - vmm->address;
> + if (!vmm->readable) {
> + if (!local)
> + unlock(&vm_maps_lock);
> + prerror(" vmm not readable\n");
> + ret = false;
> + assert(0);
> + goto out;
> + }
> + if (store && !vmm->writeable) {
> + if (!vm_strict) {
> + htab_install(dar, pa, store, 0, vmm->ci, local);
> + if (!local)
> + unlock(&vm_maps_lock);
> + prerror("Page fault store to RO VMM:%s at NIA:0x%016llx DAR:0x%016llx\n", vmm->name, nia, dar);
> + backtrace();
> + goto out;
> + }
> + if (!local)
> + unlock(&vm_maps_lock);
> + prerror(" vmm not writeable\n");
> + ret = false;
> + assert(0);
> + goto out;
> + }
> +
> + htab_install(dar, pa, vmm->writeable, vmm->executable, vmm->ci, local);
> + if (!local)
> + unlock(&vm_maps_lock);
> +
> +out:
> + return ret;
> +}
> +
> +bool __nomcount vm_isi(uint64_t nia)
> +{
> + struct vm_map *vmm;
> +
> + lock(&vm_maps_lock);
> + list_for_each(&vm_maps, vmm, list) {
> + assert(vmm->pa == vmm->address);
> + if (nia >= vmm->address && nia < vmm->address + vmm->length) {
> + if (!vmm->executable)
> + prerror("Page fault at NIA:0x%016llx NX mapping!\n", nia);
> + goto found;
> + }
> + }
> +
> + prerror("Page fault, no mapping for NIA:0x%016llx !\n", nia);
> +
> +found:
> + unlock(&vm_maps_lock);
> + htab_install(nia, nia, 0, 1, 0, false);
> +
> + return true;
> +}
> +
> +static void cpu_stop_vm(void *arg __unused)
> +{
> + vm_exit();
> +}
> +
> +static void cpu_cleanup_vm(void *arg __unused)
> +{
> + slb_remove_all();
> + mtspr(SPR_PTCR, 0);
> + mtspr(SPR_LPCR, old_lpcr);
> +}
> +
> +static void cpu_all_destroy_vm(void)
> +{
> + struct cpu_thread *cpu;
> + struct cpu_job **jobs;
> +
> + jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
> + assert(jobs);
> +
> + /* Stop all CPUs */
> + for_each_available_cpu(cpu) {
> + if (cpu == this_cpu())
> + continue;
> + jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_stop_vm",
> + cpu_stop_vm, NULL);
> + }
> +
> + /* this cpu */
> + cpu_stop_vm(NULL);
> +
> + /* Clean up after all have stopped */
> + for_each_available_cpu(cpu) {
> + if (jobs[cpu->pir])
> + cpu_wait_job(jobs[cpu->pir], true);
> + }
> +
> + for_each_available_cpu(cpu) {
> + if (cpu == this_cpu())
> + continue;
> + jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_cleanup_vm",
> + cpu_cleanup_vm, NULL);
> + }
> +
> + /* this cpu */
> + cpu_cleanup_vm(NULL);
> +
> + for_each_available_cpu(cpu) {
> + if (jobs[cpu->pir])
> + cpu_wait_job(jobs[cpu->pir], true);
> + }
> +
> + free(jobs);
> +
> + cleanup_global_tlb();
> +}
> +
> +static void print_maps(void)
> +{
> + struct vm_map *vmm;
> +
> + prlog(PR_DEBUG, " %lu Global mappings\n", nr_vm_maps);
> + list_for_each(&vm_maps, vmm, list) {
> + prlog(PR_DEBUG, "%28s 0x%08llx-0x%08llx\n", vmm->name,
> + vmm->address, vmm->address + vmm->length);
> + }
> +}
> +
> +void vm_init(bool fast_reboot)
> +{
> + unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
> + unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
> + unsigned long sym_start = (unsigned long)__sym_map_start;
> + unsigned long sym_size = (unsigned long)__sym_map_end - sym_start;
> + unsigned long htab_nr_bytes;
> + unsigned long htab_nr_ptegs;
> +
> + old_lpcr = mfspr(SPR_LPCR);
> + new_lpcr = (old_lpcr & ~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43)))
> + | PPC_BIT(54);
> +
> + prtab = memalign(64*1024, 64*1024);
> + assert(prtab);
> + memset(prtab, 0, 64*1024);
> +
> + htab_shift = 18; /* 256kB table */
> + htab_nr_bytes = 1UL << htab_shift;
> + htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
> + htab_pteg_mask = htab_nr_ptegs - 1;
> + htab = memalign(1UL << htab_shift, htab_nr_bytes);
> + assert(htab);
> + memset(htab, 0, htab_nr_bytes);
> +
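> + /* Partition table entry 0: HTAB origin | size (htab_shift - 18), HR=0 for HPT */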
> + prtab[0].dword[0] = cpu_to_be64((unsigned long)htab | (htab_shift - 18));
> + prtab[0].dword[1] = 0;
> +
> + eieio();
> +
> + vm_init_cpu();
> +
> + cleanup_global_tlb();
> +
> + if (vm_globals_allocated) {
> + assert(fast_reboot);
> + goto done;
> + }
> +
> + assert(!fast_reboot);
> + vm_globals_allocated = true;
> +
> + vm_map_global_text("OPAL text", (unsigned long)_stext,
> + (unsigned long)_etext - (unsigned long)_stext);
> + vm_map_global("OPAL rodata", (unsigned long)__rodata_start,
> + (unsigned long)__vm_mapped_romem_end - (unsigned long)__rodata_start,
> + false, false);
> + vm_map_global("OPAL data", (unsigned long)_sdata,
> + (unsigned long)_edata - (unsigned long)_sdata,
> + true, false);
> + vm_map_global("OPAL symbols", sym_start, sym_size, false, false);
> + vm_map_global("OPAL bss", (unsigned long)_sbss,
> + (unsigned long)_ebss - (unsigned long)_sbss,
> + true, false);
> + vm_map_global("OPAL heap", HEAP_BASE, HEAP_SIZE, true, false);
> + vm_map_global("Memory console", INMEM_CON_START, INMEM_CON_LEN, true, false);
> + vm_map_global("Hostboot console", HBRT_CON_START, HBRT_CON_LEN, false, false);
> + vm_map_global("SPIRA heap", SPIRA_HEAP_BASE, SPIRA_HEAP_SIZE, false, false);
> + vm_map_global("PSI TCE table", PSI_TCE_TABLE_BASE, PSI_TCE_TABLE_SIZE, false, false);
> + vm_map_global("OPAL boot stacks", stack_start, stack_end - stack_start, true, false);
> +
> +done:
> + prlog(PR_DEBUG, "VMM: SETUP\n");
> + prlog(PR_DEBUG, " PRTAB:%p\n", prtab);
> + prlog(PR_DEBUG, " HTAB: %p\n", htab);
> + print_maps();
> +
> + vm_setup = true;
> +
> + vm_enter();
> +}
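
The dword[0] write above packs the HPT base together with its size
encoding: HTABSIZE is log2 of the table size minus 18 (256kB being the
smallest architected HPT), which is why htab_shift starts at 18 and the
OR term is (htab_shift - 18). In miniature, with a hypothetical helper
name that is not part of the patch:

    /* HPT origin ORed with HTABSIZE = log2(bytes) - 18 */
    static inline uint64_t make_patb0(void *hpt, unsigned int shift)
    {
            return (uint64_t)hpt | (shift - 18);
    }
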
> +
> +void vm_init_stacks(void)
> +{
> + unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
> + unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
> + struct cpu_thread *c = this_cpu();
> + struct vm_map *vmm;
> +
> + /* Cannot take a d-side fault while holding this lock */
> + if (c->vm_setup)
> + mtmsr(mfmsr() & ~MSR_DR);
> + lock(&vm_maps_lock);
> + list_for_each(&vm_maps, vmm, list) {
> + if (vmm->address >= stack_end)
> + continue;
> + if (vmm->address + vmm->length <= stack_start)
> + continue;
> + goto found;
> + }
> + unlock(&vm_maps_lock);
> + assert(0);
> +
> +found:
> + vmm->name = "OPAL stacks";
> + vmm->address = stack_start;
> + vmm->length = stack_end - stack_start;
> + unlock(&vm_maps_lock);
> + if (c->vm_setup)
> + mtmsr(mfmsr() | MSR_DR);
> +}
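
Worth noting the locking pattern here: translation is switched off
around the critical section so that a hash fault cannot recurse into
vm_maps_lock while it is held. The same shape, reduced to its bones:

    if (this_cpu()->vm_setup)
            mtmsr(mfmsr() & ~MSR_DR);   /* no d-side faults from here */
    lock(&vm_maps_lock);
    /* ... walk or modify vm_maps ... */
    unlock(&vm_maps_lock);
    if (this_cpu()->vm_setup)
            mtmsr(mfmsr() | MSR_DR);
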
> +
> +void vm_destroy(void)
> +{
> + assert(vm_setup);
> +
> + prlog(PR_DEBUG, "VMM: TEARDOWN\n");
> + print_maps();
> +
> + cpu_all_destroy_vm();
> +
> + vm_setup = false;
> +
> + if (0) { /* XXX: leave for VMM enabled fast-reboot */
> + while (!list_empty(&vm_maps)) {
> + struct vm_map *vmm;
> + vmm = list_pop(&vm_maps, struct vm_map, list);
> + free(vmm);
> + }
> + }
> +
> + free(htab);
> + htab = NULL;
> + free(prtab);
> + prtab = NULL;
> +}
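
For anyone following the later hunks, this is the whole API in use:
vm_map_global() installs a long-lived mapping visible to all CPUs
(boot-time MMIO windows, large tables), while vm_map()/vm_unmap()
bracket a short per-cpu access and cannot nest. A minimal sketch, with
made-up addresses:

    /* All-CPUs, cache-inhibited mapping of a device BAR: */
    vm_map_global("FOO MMIO", 0x6030000000000UL, 0x1000, true, true);

    /* Transient per-cpu window around a single store: */
    uint64_t *p = vm_map(addr, sizeof(*p), true);
    *p = val;
    vm_unmap(addr, sizeof(*p));
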
> diff --git a/hdata/spira.c b/hdata/spira.c
> index 35d6109d3..870903bd8 100644
> --- a/hdata/spira.c
> +++ b/hdata/spira.c
> @@ -1703,11 +1703,20 @@ static void fixup_spira(void)
> static void update_spirah_addr(void)
> {
> #if !defined(TEST)
> + beint64_t *spirah_offset;
> + beint64_t *spira_offset;
> +
> if (proc_gen < proc_gen_p9)
> return;
>
> - naca.spirah_addr = CPU_TO_BE64(SPIRAH_OFF);
> - naca.spira_addr = CPU_TO_BE64(SPIRA_OFF);
> + spirah_offset = vm_map((u64)&naca, sizeof(u64), true);
> + *spirah_offset = CPU_TO_BE64(SPIRAH_OFF);
> + vm_unmap((unsigned long)spirah_offset, sizeof(u64));
> +
> + spira_offset = vm_map((u64)&naca + 0x30, sizeof(u64), true);
> + *spira_offset = CPU_TO_BE64(SPIRA_OFF);
> + vm_unmap((unsigned long)spira_offset, sizeof(u64));
> +
> spirah.ntuples.hs_data_area.addr = CPU_TO_BE64(SPIRA_HEAP_BASE - SKIBOOT_BASE);
> spirah.ntuples.mdump_res.addr = CPU_TO_BE64(MDRT_TABLE_BASE - SKIBOOT_BASE);
> #endif
> @@ -1715,13 +1724,24 @@ static void update_spirah_addr(void)
>
> int parse_hdat(bool is_opal)
> {
> + int ret = 0;
> +
> cpu_type = PVR_TYPE(mfspr(SPR_PVR));
>
> prlog(PR_DEBUG, "Parsing HDAT...\n");
>
> + vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), true, false);
> fixup_spira();
> + vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
>
> + vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), true, false);
> update_spirah_addr();
> + vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
> +
> + /* Downgrade to read-only */
> +
> + vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), false, false);
> + vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), false, false);
>
> /*
> * Basic DT root stuff
> @@ -1742,8 +1762,10 @@ int parse_hdat(bool is_opal)
> dt_init_led_node();
>
> /* Parse PCIA */
> - if (!pcia_parse())
> - return -1;
> + if (!pcia_parse()) {
> + ret = -1;
> + goto out;
> + }
>
> /* IPL params */
> add_iplparams();
> @@ -1789,6 +1811,9 @@ int parse_hdat(bool is_opal)
> node_stb_parse();
>
> prlog(PR_DEBUG, "Parsing HDAT...done\n");
> +out:
> + vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
> + vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
>
> - return 0;
> + return ret;
> }
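
The shape of this hunk deserves a comment: the tables get a transient
writable mapping just for the fixups, then are remapped read-only for
the rest of the parse so that stray writes fault. In miniature (fixup()
and parse() are stand-ins, not real functions):

    vm_map_global("X", base, len, true, false);   /* rw for fixup */
    fixup(base);
    vm_unmap_global(base, len);

    vm_map_global("X", base, len, false, false);  /* ro while parsing */
    parse(base);
    vm_unmap_global(base, len);
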
> diff --git a/hw/fake-nvram.c b/hw/fake-nvram.c
> index 44adde4a3..d1ed62e9e 100644
> --- a/hw/fake-nvram.c
> +++ b/hw/fake-nvram.c
> @@ -23,12 +23,16 @@ int fake_nvram_info(uint32_t *total_size)
>
> int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
> {
> + void *t;
> +
> if (!nvram_region)
> return -ENODEV;
>
> + t = vm_map(nvram_region->start + src, len, false);
> lock(&fake_nvram_lock);
> - memcpy(dst, (void *) (nvram_region->start + src), len);
> + memcpy(dst, t, len);
> unlock(&fake_nvram_lock);
> + vm_unmap(nvram_region->start + src, len);
>
> nvram_read_complete(true);
>
> @@ -37,12 +41,16 @@ int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
>
> int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
> {
> + void *t;
> +
> if (!nvram_region)
> return OPAL_HARDWARE;
>
> + t = vm_map(nvram_region->start + offset, size, true);
> lock(&fake_nvram_lock);
> - memcpy((void *) (nvram_region->start + offset), src, size);
> + memcpy(t, src, size);
> unlock(&fake_nvram_lock);
> + vm_unmap(nvram_region->start + offset, size);
>
> return 0;
> }
> diff --git a/hw/homer.c b/hw/homer.c
> index c5dbd58e3..58d629d23 100644
> --- a/hw/homer.c
> +++ b/hw/homer.c
> @@ -108,6 +108,9 @@ static void homer_init_chip(struct proc_chip *chip)
>
> chip->homer_base = hbase;
> chip->homer_size = hsize;
> + /* slw late init and xive late init want to write to HOMER */
> + /* XXX: make it read only until then? */
> + vm_map_global("HOMER Image", hbase, hsize, true, false);
> }
>
> /*
> @@ -134,13 +137,21 @@ static void homer_init_chip(struct proc_chip *chip)
> chip->slw_base = sbase;
> chip->slw_bar_size = ssize;
> chip->slw_image_size = ssize; /* will be adjusted later */
> + /* XXX */
> }
>
> if (read_pba_bar(chip, bar_occ_common, &obase, &osize)) {
> - prlog(PR_DEBUG, " OCC Common Area at 0x%llx size %lldMB\n",
> - obase, osize / 0x100000);
> + static uint64_t homer_obase = 0;
> +
> chip->occ_common_base = obase;
> chip->occ_common_size = osize;
> +
> + prlog(PR_DEBUG, " OCC Common Area at 0x%llx size %lldMB\n",
> + obase, osize / 0x100000);
> + if (obase != homer_obase) {
> + vm_map_global("OCC Common Area", obase, osize, false, false);
> + homer_obase = obase;
> + }
> }
> }
>
> diff --git a/hw/lpc-uart.c b/hw/lpc-uart.c
> index 979a617c3..898fc4b1c 100644
> --- a/hw/lpc-uart.c
> +++ b/hw/lpc-uart.c
> @@ -59,7 +59,7 @@ static uint32_t uart_base;
> static bool has_irq = false, irq_ok, rx_full, tx_full;
> static uint8_t tx_room;
> static uint8_t cached_ier;
> -static void *mmio_uart_base;
> +void *mmio_uart_base;
> static int uart_console_policy = UART_CONSOLE_OPAL;
> static int lpc_irq = -1;
>
> @@ -591,6 +591,8 @@ void early_uart_init(void)
> if (!mmio_uart_base)
> return;
>
> + vm_map_global("UART MMIO", (unsigned long)mmio_uart_base, 8, true, true);
> +
> clk = dt_prop_get_u32(uart_node, "clock-frequency");
> baud = dt_prop_get_u32(uart_node, "current-speed");
>
> @@ -599,6 +601,7 @@ void early_uart_init(void)
> prlog(PR_DEBUG, "UART: Using UART at %p\n", mmio_uart_base);
> } else {
> prerror("UART: Early init failed!");
> + vm_unmap_global((unsigned long)mmio_uart_base, 8);
> mmio_uart_base = NULL;
> }
> }
> @@ -610,9 +613,6 @@ void uart_init(void)
> char *path __unused;
> const be32 *irqp;
>
> - /* Clean up after early_uart_init() */
> - mmio_uart_base = NULL;
> -
> /* UART lock is in the console path and thus must block
> * printf re-entrancy
> */
> @@ -630,13 +630,28 @@ void uart_init(void)
> * directly mapped UARTs in simulation environments
> */
> if (n->parent == dt_root) {
> + void *base;
> +
> printf("UART: Found at root !\n");
> - mmio_uart_base = (void *)dt_translate_address(n, 0, NULL);
> - if (!mmio_uart_base) {
> +
> + base = (void *)dt_translate_address(n, 0, NULL);
> + if (!base) {
> printf("UART: Failed to translate address !\n");
> return;
> }
>
> + if (mmio_uart_base != base) {
> + void *old;
> +
> + vm_map_global("UART MMIO", (unsigned long)base, 8, true, true);
> + old = mmio_uart_base;
> + mmio_uart_base = base;
> +
> + /* Clean up after early_uart_init() */
> + if (old)
> + vm_unmap_global((unsigned long)old, 8);
> + }
> +
> /* If it has an interrupt property, we consider this to be
> * a direct XICS/XIVE interrupt
> */
> @@ -665,6 +680,11 @@ void uart_init(void)
> lpc_irq = be32_to_cpu(*irqp);
> prlog(PR_DEBUG, "UART: Using LPC IRQ %d\n", lpc_irq);
> }
> +
> + if (mmio_uart_base) {
> +// vm_unmap_global((unsigned long)mmio_uart_base, 8);
> + mmio_uart_base = NULL;
> + }
> }
>
>
> diff --git a/hw/lpc.c b/hw/lpc.c
> index c2a07a0db..cb2fed2a2 100644
> --- a/hw/lpc.c
> +++ b/hw/lpc.c
> @@ -1239,6 +1239,7 @@ static void lpc_init_chip_p8(struct dt_node *xn)
> chip->lpc = lpc;
> }
>
> +void *mmio_uart_base;
> static void lpc_init_chip_p9(struct dt_node *opb_node)
> {
> uint32_t gcid = dt_get_chip_id(opb_node);
> @@ -1261,6 +1262,11 @@ static void lpc_init_chip_p9(struct dt_node *opb_node)
> if (!lpc_node)
> return;
>
> +
> + if (mmio_uart_base)
> + vm_unmap_global((unsigned long)mmio_uart_base, 8);
> + vm_map_global("LPC MMIO", addr, 0x100000000UL /* XXX: size? */, true, true);
> +
> lpc = zalloc(sizeof(struct lpcm));
> assert(lpc);
> lpc->chip_id = gcid;
> diff --git a/hw/phb4.c b/hw/phb4.c
> index 60e797cf6..2447c6722 100644
> --- a/hw/phb4.c
> +++ b/hw/phb4.c
> @@ -5830,6 +5830,7 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
> uint64_t val, phb_bar = 0, irq_bar = 0, bar_en;
> uint64_t mmio0_bar = 0, mmio0_bmask, mmio0_sz;
> uint64_t mmio1_bar = 0, mmio1_bmask, mmio1_sz;
> + uint64_t bar_sz;
> void *foo;
> __be64 mmio_win[4];
> unsigned int mmio_win_sz;
> @@ -5858,7 +5859,8 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
> bar_en = 0;
>
> /* Initialize PHB register BAR */
> - phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, NULL);
> + phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, &bar_sz);
> + vm_map_global("PHB REGS", phb_bar, bar_sz, true, true);
> rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
> phb_bar << 8);
>
> @@ -5872,18 +5874,21 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
> bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
>
> /* Same with INT BAR (ESB) */
> - phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, NULL);
> + phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, &bar_sz);
> + vm_map_global("PHB IRQ", irq_bar, bar_sz, true, true);
> xscom_write(gcid, nest_stack + XPEC_NEST_STK_IRQ_BAR, irq_bar << 8);
> bar_en |= XPEC_NEST_STK_BAR_EN_INT;
>
>
> /* Same with MMIO windows */
> phys_map_get(gcid, PHB4_64BIT_MMIO, phb_num, &mmio0_bar, &mmio0_sz);
> + vm_map_global("PHB MMIO0", mmio0_bar, mmio0_sz, true, true);
> mmio0_bmask = (~(mmio0_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
> xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0, mmio0_bar << 8);
> xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0_MASK, mmio0_bmask << 8);
>
> phys_map_get(gcid, PHB4_32BIT_MMIO, phb_num, &mmio1_bar, &mmio1_sz);
> + vm_map_global("PHB MMIO1", mmio1_bar, mmio1_sz, true, true);
> mmio1_bmask = (~(mmio1_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
> xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1, mmio1_bar << 8);
> xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1_MASK, mmio1_bmask << 8);
> diff --git a/hw/psi.c b/hw/psi.c
> index 63fcb257e..45f11c6b9 100644
> --- a/hw/psi.c
> +++ b/hw/psi.c
> @@ -908,6 +908,8 @@ static bool psi_init_psihb(struct dt_node *psihb)
>
> list_add(&psis, &psi->list);
>
> + vm_map_global("PSI", (unsigned long)psi->regs, 0x100, true, true);
> +
> val = in_be64(psi->regs + PSIHB_CR);
> if (val & PSIHB_CR_FSP_LINK_ACTIVE) {
> lock(&psi_lock);
> diff --git a/hw/slw.c b/hw/slw.c
> index beb129a86..ccb100087 100644
> --- a/hw/slw.c
> +++ b/hw/slw.c
> @@ -151,7 +151,7 @@ static void slw_patch_reset(void)
> *(sav++) = *(dst);
> *(dst++) = *(src++);
> }
> - sync_icache();
> + sync_icache(0);
> }
>
> static void slw_unpatch_reset(void)
> @@ -167,7 +167,7 @@ static void slw_unpatch_reset(void)
> *(dst++) = *(sav++);
> src++;
> }
> - sync_icache();
> + sync_icache(0);
> }
>
> static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
> diff --git a/hw/xive.c b/hw/xive.c
> index 9a36f1ab2..c6aed7c9f 100644
> --- a/hw/xive.c
> +++ b/hw/xive.c
> @@ -1397,6 +1397,7 @@ static bool xive_configure_bars(struct xive *x)
>
> /* IC BAR */
> phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
> + vm_map_global("XIVE IC", (unsigned long)x->ic_base, x->ic_size, true, true);
> val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID;
> if (IC_PAGE_SIZE == 0x10000) {
> val |= CQ_IC_BAR_64K;
> @@ -1412,6 +1413,8 @@ static bool xive_configure_bars(struct xive *x)
> * all phys_map_get(XIVE_TM) calls.
> */
> phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size);
> + if (chip_id == 0)
> + vm_map_global("XIVE TM", (unsigned long)x->tm_base, x->tm_size, true, true);
> val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID;
> if (TM_PAGE_SIZE == 0x10000) {
> x->tm_shift = 16;
> @@ -1427,6 +1430,7 @@ static bool xive_configure_bars(struct xive *x)
>
> /* PC BAR. Clear first, write mask, then write value */
> phys_map_get(chip_id, XIVE_PC, 0, (uint64_t *)&x->pc_base, &x->pc_size);
> + vm_map_global("XIVE PC", (unsigned long)x->pc_base, x->pc_size, true, true);
> xive_regwx(x, CQ_PC_BAR, 0);
> if (x->last_reg_error)
> return false;
> @@ -1441,6 +1445,7 @@ static bool xive_configure_bars(struct xive *x)
>
> /* VC BAR. Clear first, write mask, then write value */
> phys_map_get(chip_id, XIVE_VC, 0, (uint64_t *)&x->vc_base, &x->vc_size);
> + vm_map_global("XIVE VC", (unsigned long)x->vc_base, x->vc_size, true, true);
> xive_regwx(x, CQ_VC_BAR, 0);
> if (x->last_reg_error)
> return false;
> diff --git a/hw/xscom.c b/hw/xscom.c
> index 0eda567fc..ef1a83fd4 100644
> --- a/hw/xscom.c
> +++ b/hw/xscom.c
> @@ -931,6 +931,7 @@ void xscom_init(void)
> const struct dt_property *reg;
> struct proc_chip *chip;
> const char *chip_name;
> + u64 size;
> static const char *chip_names[] = {
> "UNKNOWN", "P8E", "P8", "P8NVL", "P9N", "P9C", "P9P"
> };
> @@ -945,6 +946,9 @@ void xscom_init(void)
> assert(reg);
>
> chip->xscom_base = dt_translate_address(xn, 0, NULL);
> + size = dt_property_get_u64(reg, 1);
> +
> + vm_map_global("XSCOM MMIO", chip->xscom_base, size, true, true);
>
> /* Grab processor type and EC level */
> xscom_init_chip_info(chip);
> diff --git a/include/cmpxchg.h b/include/cmpxchg.h
> index 0304e9134..835743cf5 100644
> --- a/include/cmpxchg.h
> +++ b/include/cmpxchg.h
> @@ -5,6 +5,9 @@
> #define __CMPXCHG_H
>
> #ifndef __TEST__
> +#include <stdint.h>
> +#include <processor.h>
> +
> /*
> * Bare cmpxchg, no barriers.
> */
> diff --git a/include/cpu.h b/include/cpu.h
> index 8ef20e35b..026328904 100644
> --- a/include/cpu.h
> +++ b/include/cpu.h
> @@ -12,6 +12,19 @@
> #include <stack.h>
> #include <timer.h>
>
> +struct vm_map {
> + struct list_node list;
> +
> + const char *name;
> + uint64_t address;
> + uint64_t pa;
> + uint64_t length;
> + bool readable;
> + bool writeable;
> + bool executable;
> + bool ci;
> +};
> +
> /*
> * cpu_thread is our internal structure representing each
> * thread in the system
> @@ -71,10 +84,19 @@ struct cpu_thread {
> struct bt_entry stack_bot_bt[CPU_BACKTRACE_SIZE];
> struct bt_metadata stack_bot_bt_metadata;
> #endif
> + /*
> + * Per-thread VM parameters
> + */
> + struct vm_map vm_local_map; /* per-cpu map */
> + bool vm_local_map_inuse;
> + uint8_t vm_slb_rr; /* RR allocator */
> + bool vm_setup; /* virtual memory is up */
> +
> struct lock job_lock;
> struct list_head job_queue;
> uint32_t job_count;
> bool job_has_no_return;
> +
> /*
> * Per-core mask tracking for threads in HMI handler and
> * a cleanup done bit.
> diff --git a/include/elf-abi.h b/include/elf-abi.h
> index 29c757642..34b95d337 100644
> --- a/include/elf-abi.h
> +++ b/include/elf-abi.h
> @@ -21,7 +21,16 @@
> static inline uint64_t function_entry_address(void *func)
> {
> #ifdef ELF_ABI_v2
> - u32 *insn = func;
> + u32 *ret = func;
> + u32 *i;
> + u32 insn;
> + u32 insn2;
> +
> + i = vm_map((unsigned long)func, sizeof(insn)*2, false);
> + insn = *i;
> + insn2 = *(i+1);
> + vm_unmap((unsigned long)func, sizeof(insn)*2);
> +
> /*
> * A PPC64 ABIv2 function may have a local and a global entry
> * point. We use the local entry point for branch tables called
> @@ -38,12 +47,12 @@ static inline uint64_t function_entry_address(void *func)
> * lis r2,XXXX
> * addi r2,r2,XXXX
> */
> - if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
> - ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
> - ((*(insn+1) & OP_RT_RA_MASK) == ADDI_R2_R2))
> - return (uint64_t)(insn + 2);
> + if ((((insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
> + ((insn & OP_RT_RA_MASK) == LIS_R2)) &&
> + ((insn2 & OP_RT_RA_MASK) == ADDI_R2_R2))
> + return (uint64_t)(ret + 2);
> else
> - return (uint64_t)func;
> + return (uint64_t)ret;
> #else
> return *(uint64_t *)func;
> #endif
> diff --git a/include/io.h b/include/io.h
> index f00021dcd..5c1bd41b4 100644
> --- a/include/io.h
> +++ b/include/io.h
> @@ -7,6 +7,7 @@
> #ifndef __ASSEMBLY__
>
> #include <compiler.h>
> +#include <skiboot.h>
> #include <stdint.h>
> #include <processor.h>
> #include <types.h>
> @@ -23,8 +24,13 @@
> static inline uint8_t __in_8(const volatile uint8_t *addr)
> {
> uint8_t val;
> - asm volatile("lbzcix %0,0,%1" :
> - "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> + if (vm_realmode())
> + asm volatile("lbzcix %0,0,%1" :
> + "=r"(val) : "r"(addr), "m"(*addr));
> + else
> + val = *addr;
> +
> return val;
> }
>
> @@ -37,8 +43,13 @@ static inline uint8_t in_8(const volatile uint8_t *addr)
> static inline uint16_t __in_be16(const volatile beint16_t *addr)
> {
> __be16 val;
> - asm volatile("lhzcix %0,0,%1" :
> - "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> + if (vm_realmode())
> + asm volatile("lhzcix %0,0,%1" :
> + "=r"(val) : "r"(addr), "m"(*addr));
> + else
> + val = *addr;
> +
> return be16_to_cpu(val);
> }
>
> @@ -51,8 +62,13 @@ static inline uint16_t in_be16(const volatile beint16_t *addr)
> static inline uint16_t __in_le16(const volatile leint16_t *addr)
> {
> __le16 val;
> - asm volatile("lhzcix %0,0,%1" :
> - "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> + if (vm_realmode())
> + asm volatile("lhzcix %0,0,%1" :
> + "=r"(val) : "r"(addr), "m"(*addr));
> + else
> + val = *addr;
> +
> return le16_to_cpu(val);
> }
>
> @@ -65,8 +81,13 @@ static inline uint16_t in_le16(const volatile leint16_t *addr)
> static inline uint32_t __in_be32(const volatile beint32_t *addr)
> {
> __be32 val;
> - asm volatile("lwzcix %0,0,%1" :
> - "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> + if (vm_realmode())
> + asm volatile("lwzcix %0,0,%1" :
> + "=r"(val) : "r"(addr), "m"(*addr));
> + else
> + val = *addr;
> +
> return be32_to_cpu(val);
> }
>
> @@ -79,8 +100,13 @@ static inline uint32_t in_be32(const volatile beint32_t *addr)
> static inline uint32_t __in_le32(const volatile leint32_t *addr)
> {
> __le32 val;
> - asm volatile("lwzcix %0,0,%1" :
> - "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> + if (vm_realmode())
> + asm volatile("lwzcix %0,0,%1" :
> + "=r"(val) : "r"(addr), "m"(*addr));
> + else
> + val = *addr;
> +
> return le32_to_cpu(val);
> }
>
> @@ -93,8 +119,13 @@ static inline uint32_t in_le32(const volatile leint32_t *addr)
> static inline uint64_t __in_be64(const volatile beint64_t *addr)
> {
> __be64 val;
> - asm volatile("ldcix %0,0,%1" :
> - "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> + if (vm_realmode())
> + asm volatile("ldcix %0,0,%1" :
> + "=r"(val) : "r"(addr), "m"(*addr));
> + else
> + val = *addr;
> +
> return be64_to_cpu(val);
> }
>
> @@ -107,8 +138,13 @@ static inline uint64_t in_be64(const volatile beint64_t *addr)
> static inline uint64_t __in_le64(const volatile leint64_t *addr)
> {
> __le64 val;
> - asm volatile("ldcix %0,0,%1" :
> - "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> + if (vm_realmode())
> + asm volatile("ldcix %0,0,%1" :
> + "=r"(val) : "r"(addr), "m"(*addr));
> + else
> + val = *addr;
> +
> return le64_to_cpu(val);
> }
>
> @@ -120,8 +156,11 @@ static inline uint64_t in_le64(const volatile leint64_t *addr)
>
> static inline void __out_8(volatile uint8_t *addr, uint8_t val)
> {
> - asm volatile("stbcix %0,0,%1"
> - : : "r"(val), "r"(addr), "m"(*addr) : "memory");
> + if (vm_realmode())
> + asm volatile("stbcix %0,0,%1"
> + : : "r"(val), "r"(addr), "m"(*addr));
> + else
> + *addr = val;
> }
>
> static inline void out_8(volatile uint8_t *addr, uint8_t val)
> @@ -132,8 +171,12 @@ static inline void out_8(volatile uint8_t *addr, uint8_t val)
>
> static inline void __out_be16(volatile beint16_t *addr, uint16_t val)
> {
> - asm volatile("sthcix %0,0,%1"
> - : : "r"(cpu_to_be16(val)), "r"(addr), "m"(*addr) : "memory");
> + __be16 __val = cpu_to_be16(val);
> + if (vm_realmode())
> + asm volatile("sthcix %0,0,%1"
> + : : "r"(__val), "r"(addr), "m"(*addr));
> + else
> + *addr = __val;
> }
>
> static inline void out_be16(volatile beint16_t *addr, uint16_t val)
> @@ -144,8 +187,12 @@ static inline void out_be16(volatile beint16_t *addr, uint16_t val)
>
> static inline void __out_le16(volatile leint16_t *addr, uint16_t val)
> {
> - asm volatile("sthcix %0,0,%1"
> - : : "r"(cpu_to_le16(val)), "r"(addr), "m"(*addr) : "memory");
> + __le16 __val = cpu_to_le16(val);
> + if (vm_realmode())
> + asm volatile("sthcix %0,0,%1"
> + : : "r"(__val), "r"(addr), "m"(*addr));
> + else
> + *addr = __val;
> }
>
> static inline void out_le16(volatile leint16_t *addr, uint16_t val)
> @@ -156,8 +203,12 @@ static inline void out_le16(volatile leint16_t *addr, uint16_t val)
>
> static inline void __out_be32(volatile beint32_t *addr, uint32_t val)
> {
> - asm volatile("stwcix %0,0,%1"
> - : : "r"(cpu_to_be32(val)), "r"(addr), "m"(*addr) : "memory");
> + __be32 __val = cpu_to_be32(val);
> + if (vm_realmode())
> + asm volatile("stwcix %0,0,%1"
> + : : "r"(__val), "r"(addr), "m"(*addr));
> + else
> + *addr = __val;
> }
>
> static inline void out_be32(volatile beint32_t *addr, uint32_t val)
> @@ -168,8 +219,12 @@ static inline void out_be32(volatile beint32_t *addr, uint32_t val)
>
> static inline void __out_le32(volatile leint32_t *addr, uint32_t val)
> {
> - asm volatile("stwcix %0,0,%1"
> - : : "r"(cpu_to_le32(val)), "r"(addr), "m"(*addr) : "memory");
> + __le32 __val = cpu_to_le32(val);
> + if (vm_realmode())
> + asm volatile("stwcix %0,0,%1"
> + : : "r"(__val), "r"(addr), "m"(*addr));
> + else
> + *addr = __val;
> }
>
> static inline void out_le32(volatile leint32_t *addr, uint32_t val)
> @@ -180,8 +235,12 @@ static inline void out_le32(volatile leint32_t *addr, uint32_t val)
>
> static inline void __out_be64(volatile beint64_t *addr, uint64_t val)
> {
> - asm volatile("stdcix %0,0,%1"
> - : : "r"(cpu_to_be64(val)), "r"(addr), "m"(*addr) : "memory");
> + __be64 __val = cpu_to_be64(val);
> + if (vm_realmode())
> + asm volatile("stdcix %0,0,%1"
> + : : "r"(__val), "r"(addr), "m"(*addr));
> + else
> + *addr = __val;
> }
>
> static inline void out_be64(volatile beint64_t *addr, uint64_t val)
> @@ -192,8 +251,12 @@ static inline void out_be64(volatile beint64_t *addr, uint64_t val)
>
> static inline void __out_le64(volatile leint64_t *addr, uint64_t val)
> {
> - asm volatile("stdcix %0,0,%1"
> - : : "r"(cpu_to_le64(val)), "r"(addr), "m"(*addr) : "memory");
> + __le64 __val = cpu_to_le64(val);
> + if (vm_realmode())
> + asm volatile("stdcix %0,0,%1"
> + : : "r"(__val), "r"(addr), "m"(*addr));
> + else
> + *addr = __val;
> }
>
> static inline void out_le64(volatile leint64_t *addr, uint64_t val)
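
These accessor changes pair with the ci argument of vm_map_global(): in
real mode the cache-inhibited forms (lbzcix/stbcix and friends) are
still required, but with translation on the mapping itself is
cache-inhibited, so a plain volatile access is correct. For example
(offset 5 being the usual 16550 LSR, nothing this patch defines):

    vm_map_global("UART MMIO", (unsigned long)mmio_uart_base, 8, true, true);
    uint8_t lsr = in_8(mmio_uart_base + 5);   /* CI access either way */
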
> diff --git a/include/mem_region.h b/include/mem_region.h
> index 3e3818a66..47c3bd70c 100644
> --- a/include/mem_region.h
> +++ b/include/mem_region.h
> @@ -33,6 +33,7 @@ struct mem_region {
> struct list_node list;
> const char *name;
> uint64_t start, len;
> + uint64_t vm_mapped_len;
> struct dt_node *node;
> enum mem_region_type type;
> struct list_head free_list;
> diff --git a/include/platform.h b/include/platform.h
> index 6aa263ae0..e431a5fe0 100644
> --- a/include/platform.h
> +++ b/include/platform.h
> @@ -298,8 +298,8 @@ struct platform {
> void (*vpd_iohub_load)(struct dt_node *hub_node);
> };
>
> -extern struct platform __platforms_start;
> -extern struct platform __platforms_end;
> +extern struct platform __platforms_start[];
> +extern struct platform __platforms_end[];
>
> extern struct platform platform;
> extern const struct bmc_platform *bmc_platform;
> diff --git a/include/processor.h b/include/processor.h
> index 7ba251bb4..9d197ffc1 100644
> --- a/include/processor.h
> +++ b/include/processor.h
> @@ -39,7 +39,9 @@
> #define SPR_SRR1 0x01b /* RW: Exception save/restore reg 1 */
> #define SPR_CFAR 0x01c /* RW: Come From Address Register */
> #define SPR_AMR 0x01d /* RW: Authority Mask Register */
> +#define SPR_PID 0x030 /* RW: PID register */
> #define SPR_IAMR 0x03d /* RW: Instruction Authority Mask Register */
> +#define SPR_UAMOR 0x09d
> #define SPR_RPR 0x0ba /* RW: Relative Priority Register */
> #define SPR_TBRL 0x10c /* RO: Timebase low */
> #define SPR_TBRU 0x10d /* RO: Timebase high */
> @@ -61,10 +63,12 @@
> #define SPR_HSRR1 0x13b /* RW: HV Exception save/restore reg 1 */
> #define SPR_TFMR 0x13d
> #define SPR_LPCR 0x13e
> +#define SPR_LPID 0x13f /* RW: LPID register */
> #define SPR_HMER 0x150 /* Hypervisor Maintenance Exception */
> #define SPR_HMEER 0x151 /* HMER interrupt enable mask */
> #define SPR_PCR 0x152
> #define SPR_AMOR 0x15d
> +#define SPR_PTCR 0x1d0 /* RW: Partition table control register */
> #define SPR_PSSCR 0x357 /* RW: Stop status and control (ISA 3) */
> #define SPR_TSCR 0x399
> #define SPR_HID0 0x3f0
> @@ -80,6 +84,11 @@
> #define SPR_SRR1_PM_WAKE_SRESET 0x100000
> #define SPR_SRR1_PM_WAKE_MCE 0x3c0000 /* Use reserved value for MCE */
>
> +/* Bits in DSISR */
> +
> +#define DSISR_ISSTORE 0x02000000
> +
> +
> /* Bits in LPCR */
>
> /* Powersave Exit Cause Enable is different on each generation */
> @@ -318,9 +327,9 @@ static inline void isync(void)
> /*
> * Cache sync
> */
> -static inline void sync_icache(void)
> +static inline void sync_icache(unsigned long ptr)
> {
> - asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (0) : "memory");
> + asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (ptr) : "memory");
> }
>
> /*
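
sync_icache() now takes the address to invalidate, so code-patching
paths can icbi the line they actually modified (slw.c above still
passes 0). Sketch of the intended use:

    *dst = new_insn;                    /* patch in place */
    sync_icache((unsigned long)dst);    /* flush that line, then isync */
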
> diff --git a/include/skiboot.h b/include/skiboot.h
> index 30ff500c5..aacb425f7 100644
> --- a/include/skiboot.h
> +++ b/include/skiboot.h
> @@ -42,10 +42,16 @@ extern char _stext[];
> extern char _etext[];
> extern char __sym_map_end[];
> extern char _romem_end[];
> +extern char __vm_mapped_romem_end[];
>
> #ifndef __TESTING__
> +extern char _stext[], _etext[];
> /* Readonly section start and end. */
> extern char __rodata_start[], __rodata_end[];
> +extern char _sdata[], _edata[];
> +extern char __sym_map_start[], __sym_map_end[];
> +extern char _sbss[], _ebss[];
> +extern char _end[];
>
> static inline bool is_rodata(const void *p)
> {
> @@ -184,6 +190,7 @@ extern void disable_fast_reboot(const char *reason);
> extern void add_fast_reboot_dt_entries(void);
> extern void fast_reboot(void);
> extern void __noreturn __secondary_cpu_entry(void);
> +extern void __noreturn __return_cpu_entry(void);
> extern void __noreturn load_and_boot_kernel(bool is_reboot);
> extern void cleanup_local_tlb(void);
> extern void cleanup_global_tlb(void);
> @@ -336,4 +343,24 @@ extern int fake_nvram_info(uint32_t *total_size);
> extern int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len);
> extern int fake_nvram_write(uint32_t offset, void *src, uint32_t size);
>
> +/* core/vm.c */
> +bool vm_realmode(void);
> +void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci);
> +void vm_map_global_text(const char *name, unsigned long addr, unsigned long len);
> +void vm_unmap_global(unsigned long addr, unsigned long len);
> +void *vm_map(unsigned long addr, unsigned long len, bool rw);
> +void vm_unmap(unsigned long addr, unsigned long len);
> +void vm_init(bool fast_reboot);
> +void vm_init_stacks(void);
> +void vm_destroy(void);
> +void vm_init_secondary(void);
> +void vm_enter(void);
> +void vm_exit(void);
> +void vm_exit_cleanup(void);
> +void vm_map_stacks(void);
> +bool vm_dslb(uint64_t nia, uint64_t dar);
> +bool vm_islb(uint64_t nia);
> +bool vm_dsi(uint64_t nia, uint64_t dar, uint32_t dsisr);
> +bool vm_isi(uint64_t nia);
> +
> #endif /* __SKIBOOT_H */
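
The four fault hooks each return whether the fault was handled, which
suggests a dispatch along these lines in core/exceptions.c (a sketch
using the standard Power vectors, not the patch's actual wiring):

    static bool handle_mmu_fault(uint64_t vec, uint64_t nia,
                                 uint64_t dar, uint32_t dsisr)
    {
            switch (vec) {
            case 0x300: return vm_dsi(nia, dar, dsisr);  /* data storage */
            case 0x380: return vm_dslb(nia, dar);        /* data SLB miss */
            case 0x400: return vm_isi(nia);              /* instr storage */
            case 0x480: return vm_islb(nia);             /* instr SLB miss */
            }
            return false;   /* not an MMU fault we handle */
    }
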
> diff --git a/libstb/container.c b/libstb/container.c
> index eca54cf63..2b8f22f70 100644
> --- a/libstb/container.c
> +++ b/libstb/container.c
> @@ -6,14 +6,20 @@
>
> bool stb_is_container(const void *buf, size_t size)
> {
> + beint32_t *t;
> ROM_container_raw *c;
> + bool ret = true;
>
> c = (ROM_container_raw*) buf;
> if (!buf || size < SECURE_BOOT_HEADERS_SIZE)
> return false;
> - if (be32_to_cpu(c->magic_number) != ROM_MAGIC_NUMBER )
> - return false;
> - return true;
> +
> + t = vm_map((unsigned long)&c->magic_number, sizeof(*t), false);
> + if (be32_to_cpu(*t) != ROM_MAGIC_NUMBER)
> + ret = false;
> + vm_unmap((unsigned long)&c->magic_number, sizeof(*t));
> +
> + return ret;
> }
>
> uint32_t stb_payload_magic(const void *buf, size_t size)
> diff --git a/libstb/cvc.c b/libstb/cvc.c
> index 663e53953..08b2eea60 100644
> --- a/libstb/cvc.c
> +++ b/libstb/cvc.c
> @@ -155,6 +155,9 @@ static int cvc_reserved_mem_init(struct dt_node *parent) {
> return -1;
> }
> addr = dt_get_address(cvc_resv_mem, 0, &size);
> + if (size == 0) // MAMBO HACK
> + size = 64*1024;
> + vm_map_global_text("STB-CVC", addr, size);
> cvc_register(addr, addr + size-1);
>
> exports = dt_find_by_path(dt_root, "/ibm,opal/firmware/exports");
> diff --git a/libstb/secureboot.c b/libstb/secureboot.c
> index c86972161..dc3bda3d2 100644
> --- a/libstb/secureboot.c
> +++ b/libstb/secureboot.c
> @@ -164,6 +164,7 @@ int secureboot_verify(enum resource_id id, void *buf, size_t len)
> {
> const char *name;
> __be64 log;
> + void *vbuf;
> int rc = -1;
>
> name = flash_map_resource_name(id);
> @@ -181,7 +182,9 @@ int secureboot_verify(enum resource_id id, void *buf, size_t len)
> return -1;
> }
>
> - rc = call_cvc_verify(buf, len, hw_key_hash, hw_key_hash_size, &log);
> + vbuf = vm_map((unsigned long)buf, len, false);
> + rc = call_cvc_verify(vbuf, len, hw_key_hash, hw_key_hash_size, &log);
> + vm_unmap((unsigned long)buf, len);
>
> if (rc == OPAL_SUCCESS) {
> prlog(PR_NOTICE, "%s verified\n", name);
> diff --git a/libstb/trustedboot.c b/libstb/trustedboot.c
> index 413862e63..910354f7b 100644
> --- a/libstb/trustedboot.c
> +++ b/libstb/trustedboot.c
> @@ -161,7 +161,7 @@ out_free:
> int trustedboot_measure(enum resource_id id, void *buf, size_t len)
> {
> uint8_t digest[SHA512_DIGEST_LENGTH];
> - void *buf_aux;
> + void *buf_aux, *vbuf;
> size_t len_aux;
> const char *name;
> TPM_Pcr pcr;
> @@ -219,7 +219,9 @@ int trustedboot_measure(enum resource_id id, void *buf, size_t len)
> len_aux = len;
> }
>
> - rc = call_cvc_sha512(buf_aux, len_aux, digest, SHA512_DIGEST_LENGTH);
> + vbuf = vm_map((unsigned long)buf_aux, len_aux, false);
> + rc = call_cvc_sha512(vbuf, len_aux, digest, SHA512_DIGEST_LENGTH);
> + vm_unmap((unsigned long)buf_aux, len_aux);
>
> if (rc == OPAL_SUCCESS) {
> prlog(PR_NOTICE, "%s hash calculated\n", name);
> diff --git a/skiboot.lds.S b/skiboot.lds.S
> index b136e4004..9d21681ab 100644
> --- a/skiboot.lds.S
> +++ b/skiboot.lds.S
> @@ -123,12 +123,26 @@ SECTIONS
> __rodata_end = .;
> }
>
> + . = ALIGN(0x100);
> + .got : {
> + __toc_start = . + 0x8000;
> + *(.got)
> + *(.toc)
> + }
> +
> + . = ALIGN(0x10);
> + .opd : {
> + *(.opd)
> + }
> +
> . = ALIGN(0x10);
> .trap_table : {
> __trap_table_start = .;
> KEEP(*(.trap_table))
> __trap_table_end = .;
> }
> + __vm_mapped_romem_end = .;
> + . = ALIGN(PAGE_SIZE);
>
> . = ALIGN(0x10);
> .init : {
> @@ -139,18 +153,6 @@ SECTIONS
> __ctors_end = .;
> }
>
> - . = ALIGN(0x10);
> - .opd : {
> - *(.opd)
> - }
> -
> - . = ALIGN(0x100);
> - .got : {
> - __toc_start = . + 0x8000;
> - *(.got)
> - *(.toc)
> - }
> -
> . = ALIGN(0x10);
> .opal_table : {
> __opal_table_start = .;
>