[RFC PATCH 1/3] powerpc/64s: Fix system call emulation

Nicholas Piggin npiggin at gmail.com
Tue Jan 18 01:24:21 AEDT 2022


Interrupt return code expects the returned-to irq state to be reconciled
with the returned-to MSR[EE] state. That is, if local irqs are enabled
then MSR[EE] must be set, and if MSR[EE] is not set then local irqs must
be disabled.

System call emulation (both sc and scv 0) does not get this right, it
tries to return to a context with MSR[EE]=0 and local irqs enabled. This
confuses interrupt return and triggers a warning and probably worse.

Fix this by returning to the system call entry points with MSR[EE]=0 and
local irqs disabled. Add a case to deal with this kind of entry in the
system call handler. The difficulty is that the interrupt return changes
from user-mode to kernel mode, so it does not restore everything to the
way it would look when coming from userspace (e.g., CPU accounting, kuap,
the pkey regs, etc).

XXX: I don't know if this is quite the best problem. System call emulation
is much more complicated than it looks due to this return-to-kernel problem.
Even now the patch relies on SOFTE being set in the stack by the interrupt
return reading back the same way by the system call handler that creates
a new stack at the same position (this is how it determines it was an
emulated syscall). Not only that but suspect the IAMR is not being restored
correctly here and the correct user value on the stack gets clobbered.

Better option might be to have per-thread data that sets an emulated
syscall required flag and saves certain things like iamr. Or possibly just
bite the bullet and create new entry points for syscall emulation.
---
 arch/powerpc/kernel/interrupt.c    | 35 ++++++++++++++++++++----------
 arch/powerpc/kernel/interrupt_64.S | 10 ---------
 arch/powerpc/lib/sstep.c           |  9 +++++---
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 7cd6ce3ec423..e73ad5842cb0 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -81,10 +81,31 @@ notrace long system_call_exception(long r3, long r4, long r5,
 {
 	syscall_fn f;
 
-	kuap_lock();
-
 	regs->orig_gpr3 = r3;
 
+	if (IS_ENABLED(CONFIG_PPC64) &&
+			unlikely(arch_irq_disabled_regs(regs))) {
+		irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);
+		/*
+		 * The first stack frame entry will have IRQS_ENABLED except
+		 * in the case of syscall emulation, where the syscall entry
+		 * code is returned-to with MSR[EE] disabled, which requires
+		 * regs->softe is IRQS_DISABLED to avoid triggering the
+		 * interrupt return code warning for returning to local irqs
+		 * enabled but MSR[EE]=0. Not a big deal to re-set it here.
+		 */
+#ifdef CONFIG_PPC_BOOK3S_64
+		set_kuap(AMR_KUAP_BLOCKED);
+#endif
+		if (trap_is_scv(regs))
+			local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+		/* XXX: pkey save? Did we save the wrong values in stack
+		 * from userspace now? */
+		goto skip_user_entry;
+	}
+
+	kuap_lock();
+
 	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
 		BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
 
@@ -95,7 +116,6 @@ notrace long system_call_exception(long r3, long r4, long r5,
 
 	BUG_ON(regs_is_unrecoverable(regs));
 	BUG_ON(!(regs->msr & MSR_PR));
-	BUG_ON(arch_irq_disabled_regs(regs));
 
 #ifdef CONFIG_PPC_PKEY
 	if (mmu_has_feature(MMU_FTR_PKEY)) {
@@ -129,14 +149,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 
 	account_stolen_time();
 
-	/*
-	 * This is not required for the syscall exit path, but makes the
-	 * stack frame look nicer. If this was initialised in the first stack
-	 * frame, or if the unwinder was taught the first stack frame always
-	 * returns to user with IRQS_ENABLED, this store could be avoided!
-	 */
-	irq_soft_mask_regs_set_state(regs, IRQS_ENABLED);
-
+skip_user_entry:
 	/*
 	 * If system call is called with TM active, set _TIF_RESTOREALL to
 	 * prevent RFSCV being used to return to userspace, because POWER9
diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S
index 7bab2d7de372..6471034c7909 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -219,16 +219,6 @@ system_call_vectored common 0x3000
  */
 system_call_vectored sigill 0x7ff0
 
-
-/*
- * Entered via kernel return set up by kernel/sstep.c, must match entry regs
- */
-	.globl system_call_vectored_emulate
-system_call_vectored_emulate:
-_ASM_NOKPROBE_SYMBOL(system_call_vectored_emulate)
-	li	r10,IRQS_ALL_DISABLED
-	stb	r10,PACAIRQSOFTMASK(r13)
-	b	system_call_vectored_common
 #endif /* CONFIG_PPC_BOOK3S */
 
 	.balign IFETCH_ALIGN_BYTES
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index a94b0cd0bdc5..62d3fd925dde 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -16,7 +16,7 @@
 #include <asm/disassemble.h>
 
 extern char system_call_common[];
-extern char system_call_vectored_emulate[];
+extern char system_call_vectored_common[];
 
 #ifdef CONFIG_PPC64
 /* Bits in SRR1 that are copied from MSR */
@@ -3667,6 +3667,8 @@ int emulate_step(struct pt_regs *regs, ppc_inst_t instr)
 		regs->gpr[11] = regs->nip + 4;
 		regs->gpr[12] = regs->msr & MSR_MASK;
 		regs->gpr[13] = (unsigned long) get_paca();
+		// Return code needs regs->softe to match regs->msr & MSR_EE
+		regs->softe = IRQS_ALL_DISABLED;
 		regs_set_return_ip(regs, (unsigned long) &system_call_common);
 		regs_set_return_msr(regs, MSR_KERNEL);
 		return 1;
@@ -3674,11 +3676,12 @@ int emulate_step(struct pt_regs *regs, ppc_inst_t instr)
 #ifdef CONFIG_PPC_BOOK3S_64
 	case SYSCALL_VECTORED_0:	/* scv 0 */
 		regs->gpr[9] = regs->gpr[13];
-		regs->gpr[10] = MSR_KERNEL;
 		regs->gpr[11] = regs->nip + 4;
 		regs->gpr[12] = regs->msr & MSR_MASK;
 		regs->gpr[13] = (unsigned long) get_paca();
-		regs_set_return_ip(regs, (unsigned long) &system_call_vectored_emulate);
+		// Return code needs regs->softe to match regs->msr & MSR_EE
+		regs->softe = IRQS_ALL_DISABLED;
+		regs_set_return_ip(regs, (unsigned long) &system_call_vectored_common);
 		regs_set_return_msr(regs, MSR_KERNEL);
 		return 1;
 #endif
-- 
2.23.0



More information about the Linuxppc-dev mailing list