[PATCH v3 25/32] powerpc/64: system call implement entry/exit logic in C
Christophe Leroy
christophe.leroy at c-s.fr
Thu Mar 19 20:18:12 AEDT 2020
Le 25/02/2020 à 18:35, Nicholas Piggin a écrit :
> System call entry and particularly exit code is beyond the limit of what
> is reasonable to implement in asm.
>
> This conversion moves all conditional branches out of the asm code,
> except for the case that all GPRs should be restored at exit.
>
> Null syscall test is about 5% faster after this patch, because the exit
> work is handled under local_irq_disable, and the hard mask and pending
> interrupt replay is handled after that, which avoids games with MSR.
>
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
> Signed-off-by: Michal Suchanek <msuchanek at suse.de>
> ---
>
> v2,rebase (from Michal):
> - Add endian conversion for dtl_idx (ms)
> - Fix sparse warning about missing declaration (ms)
> - Add unistd.h to fix some defconfigs, add SPDX, minor formatting (mpe)
>
> v3: Fixes thanks to reports from mpe and selftests errors:
> - Several soft-mask debug and unsafe smp_processor_id() warnings due to
> tracing and other false positives due to checks in "unreconciled" code.
> - Fix a bug with syscall tracing functions that set registers (e.g.,
> PTRACE_SETREG) not setting GPRs properly.
> - Fix silly tabort_syscall bug that causes kernel crashes when making system
> calls in transactional state.
>
> arch/powerpc/include/asm/asm-prototypes.h | 17 +-
> .../powerpc/include/asm/book3s/64/kup-radix.h | 14 +-
> arch/powerpc/include/asm/cputime.h | 29 ++
> arch/powerpc/include/asm/hw_irq.h | 4 +
> arch/powerpc/include/asm/ptrace.h | 3 +
> arch/powerpc/include/asm/signal.h | 3 +
> arch/powerpc/include/asm/switch_to.h | 5 +
> arch/powerpc/include/asm/time.h | 3 +
> arch/powerpc/kernel/Makefile | 3 +-
> arch/powerpc/kernel/entry_64.S | 338 +++---------------
> arch/powerpc/kernel/signal.h | 2 -
> arch/powerpc/kernel/syscall_64.c | 213 +++++++++++
> arch/powerpc/kernel/systbl.S | 9 +-
> 13 files changed, 328 insertions(+), 315 deletions(-)
> create mode 100644 arch/powerpc/kernel/syscall_64.c
>
> diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
> index 983c0084fb3f..4b3609554e76 100644
> --- a/arch/powerpc/include/asm/asm-prototypes.h
> +++ b/arch/powerpc/include/asm/asm-prototypes.h
> @@ -97,6 +97,12 @@ ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp,
> unsigned long __init early_init(unsigned long dt_ptr);
> void __init machine_init(u64 dt_ptr);
> #endif
> +#ifdef CONFIG_PPC64
This ifdef is not necessary as it has no pending #else.
Having function declaration without definition is not an issue.
Keeping in mind that we are aiming at generalising this to PPC32.
> +long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs);
> +notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs);
> +notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr);
> +notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr);
> +#endif
>
> long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low,
> u32 len_high, u32 len_low);
> @@ -104,14 +110,6 @@ long sys_switch_endian(void);
> notrace unsigned int __check_irq_replay(void);
> void notrace restore_interrupts(void);
>
> -/* ptrace */
> -long do_syscall_trace_enter(struct pt_regs *regs);
> -void do_syscall_trace_leave(struct pt_regs *regs);
> -
> -/* process */
> -void restore_math(struct pt_regs *regs);
> -void restore_tm_state(struct pt_regs *regs);
> -
> /* prom_init (OpenFirmware) */
> unsigned long __init prom_init(unsigned long r3, unsigned long r4,
> unsigned long pp,
> @@ -122,9 +120,6 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
> void __init early_setup(unsigned long dt_ptr);
> void early_setup_secondary(void);
>
> -/* time */
> -void accumulate_stolen_time(void);
> -
> /* misc runtime */
> extern u64 __bswapdi2(u64);
> extern s64 __lshrdi3(s64, int);
> diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h
> index 90dd3a3fc8c7..71081d90f999 100644
> --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
> @@ -3,6 +3,7 @@
> #define _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H
>
> #include <linux/const.h>
> +#include <asm/reg.h>
>
> #define AMR_KUAP_BLOCK_READ UL(0x4000000000000000)
> #define AMR_KUAP_BLOCK_WRITE UL(0x8000000000000000)
> @@ -56,7 +57,14 @@
>
> #ifdef CONFIG_PPC_KUAP
>
> -#include <asm/reg.h>
> +#include <asm/mmu.h>
> +#include <asm/ptrace.h>
> +
> +static inline void kuap_check_amr(void)
> +{
> + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_RADIX_KUAP))
> + WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
> +}
>
> /*
> * We support individually allowing read or write, but we don't support nesting
> @@ -127,6 +135,10 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
> (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)),
> "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
> }
> +#else /* CONFIG_PPC_KUAP */
> +static inline void kuap_check_amr(void)
> +{
> +}
> #endif /* CONFIG_PPC_KUAP */
>
> #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
> index 2431b4ada2fa..6639a6847cc0 100644
> --- a/arch/powerpc/include/asm/cputime.h
> +++ b/arch/powerpc/include/asm/cputime.h
> @@ -44,6 +44,28 @@ static inline unsigned long cputime_to_usecs(const cputime_t ct)
> #ifdef CONFIG_PPC64
> #define get_accounting(tsk) (&get_paca()->accounting)
> static inline void arch_vtime_task_switch(struct task_struct *tsk) { }
Could we have the below additions sit outside of this PPC64 ifdef, to be
reused on PPC32 ?
> +
> +/*
> + * account_cpu_user_entry/exit runs "unreconciled", so can't trace,
> + * can't use use get_paca()
> + */
> +static notrace inline void account_cpu_user_entry(void)
> +{
> + unsigned long tb = mftb();
> + struct cpu_accounting_data *acct = &local_paca->accounting;
In the spirit of reusing that code on PPC32, can we use get_accounting()
? Or an alternate version of get_accounting(), eg
get_accounting_notrace() to be defined ?
> +
> + acct->utime += (tb - acct->starttime_user);
> + acct->starttime = tb;
> +}
> +static notrace inline void account_cpu_user_exit(void)
> +{
> + unsigned long tb = mftb();
> + struct cpu_accounting_data *acct = &local_paca->accounting;
> +
> + acct->stime += (tb - acct->starttime);
> + acct->starttime_user = tb;
> +}
> +
> #else
> #define get_accounting(tsk) (&task_thread_info(tsk)->accounting)
> /*
> @@ -61,5 +83,12 @@ static inline void arch_vtime_task_switch(struct task_struct *prev)
> #endif
>
> #endif /* __KERNEL__ */
> +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
> +static inline void account_cpu_user_entry(void)
> +{
> +}
> +static inline void account_cpu_user_exit(void)
> +{
> +}
> #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
> #endif /* __POWERPC_CPUTIME_H */
> diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
> index e3a905e3d573..310583e62bd9 100644
> --- a/arch/powerpc/include/asm/hw_irq.h
> +++ b/arch/powerpc/include/asm/hw_irq.h
> @@ -228,9 +228,13 @@ static inline bool arch_irqs_disabled(void)
> #ifdef CONFIG_PPC_BOOK3E
> #define __hard_irq_enable() wrtee(MSR_EE)
> #define __hard_irq_disable() wrtee(0)
> +#define __hard_EE_RI_disable() wrtee(0)
> +#define __hard_RI_enable() do { } while (0)
> #else
> #define __hard_irq_enable() __mtmsrd(MSR_EE|MSR_RI, 1)
> #define __hard_irq_disable() __mtmsrd(MSR_RI, 1)
> +#define __hard_EE_RI_disable() __mtmsrd(0, 1)
> +#define __hard_RI_enable() __mtmsrd(MSR_RI, 1)
> #endif
>
> #define hard_irq_disable() do { \
> diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
> index ee3ada66deb5..082a40153b94 100644
> --- a/arch/powerpc/include/asm/ptrace.h
> +++ b/arch/powerpc/include/asm/ptrace.h
> @@ -138,6 +138,9 @@ extern unsigned long profile_pc(struct pt_regs *regs);
> #define profile_pc(regs) instruction_pointer(regs)
> #endif
>
> +long do_syscall_trace_enter(struct pt_regs *regs);
> +void do_syscall_trace_leave(struct pt_regs *regs);
> +
> #define kernel_stack_pointer(regs) ((regs)->gpr[1])
> static inline int is_syscall_success(struct pt_regs *regs)
> {
> diff --git a/arch/powerpc/include/asm/signal.h b/arch/powerpc/include/asm/signal.h
> index 0803ca8b9149..99e1c6de27bc 100644
> --- a/arch/powerpc/include/asm/signal.h
> +++ b/arch/powerpc/include/asm/signal.h
> @@ -6,4 +6,7 @@
> #include <uapi/asm/signal.h>
> #include <uapi/asm/ptrace.h>
>
> +struct pt_regs;
> +void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
> +
> #endif /* _ASM_POWERPC_SIGNAL_H */
> diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
> index 5b03d8a82409..476008bc3d08 100644
> --- a/arch/powerpc/include/asm/switch_to.h
> +++ b/arch/powerpc/include/asm/switch_to.h
> @@ -5,6 +5,7 @@
> #ifndef _ASM_POWERPC_SWITCH_TO_H
> #define _ASM_POWERPC_SWITCH_TO_H
>
> +#include <linux/sched.h>
> #include <asm/reg.h>
>
> struct thread_struct;
> @@ -22,6 +23,10 @@ extern void switch_booke_debug_regs(struct debug_reg *new_debug);
>
> extern int emulate_altivec(struct pt_regs *);
>
> +void restore_math(struct pt_regs *regs);
> +
> +void restore_tm_state(struct pt_regs *regs);
> +
> extern void flush_all_to_thread(struct task_struct *);
> extern void giveup_all(struct task_struct *);
>
> diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
> index e0107495c4de..39ce95016a3a 100644
> --- a/arch/powerpc/include/asm/time.h
> +++ b/arch/powerpc/include/asm/time.h
> @@ -194,5 +194,8 @@ DECLARE_PER_CPU(u64, decrementers_next_tb);
> /* Convert timebase ticks to nanoseconds */
> unsigned long long tb_to_ns(unsigned long long tb_ticks);
>
> +/* SPLPAR */
> +void accumulate_stolen_time(void);
> +
> #endif /* __KERNEL__ */
> #endif /* __POWERPC_TIME_H */
> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index 78a1b22d4fd8..5700231a8988 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -50,7 +50,8 @@ obj-y := cputable.o ptrace.o syscalls.o \
> of_platform.o prom_parse.o
> obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \
> signal_64.o ptrace32.o \
> - paca.o nvram_64.o firmware.o note.o
> + paca.o nvram_64.o firmware.o note.o \
> + syscall_64.o
> obj-$(CONFIG_VDSO32) += vdso32/
> obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o
> obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
[snip]
> diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
> index 800433685888..d396efca4068 100644
> --- a/arch/powerpc/kernel/signal.h
> +++ b/arch/powerpc/kernel/signal.h
> @@ -10,8 +10,6 @@
> #ifndef _POWERPC_ARCH_SIGNAL_H
> #define _POWERPC_ARCH_SIGNAL_H
>
> -extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
> -
> extern void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
> size_t frame_size, int is_32);
>
> diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
Could some part of it go in a syscall.c to be reused on PPC32 ?
> new file mode 100644
> index 000000000000..20f77cc19df8
> --- /dev/null
> +++ b/arch/powerpc/kernel/syscall_64.c
> @@ -0,0 +1,213 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#include <linux/err.h>
> +#include <asm/asm-prototypes.h>
> +#include <asm/book3s/64/kup-radix.h>
> +#include <asm/cputime.h>
> +#include <asm/hw_irq.h>
> +#include <asm/kprobes.h>
> +#include <asm/paca.h>
> +#include <asm/ptrace.h>
> +#include <asm/reg.h>
> +#include <asm/signal.h>
> +#include <asm/switch_to.h>
> +#include <asm/syscall.h>
> +#include <asm/time.h>
> +#include <asm/unistd.h>
> +
> +extern void __noreturn tabort_syscall(unsigned long nip, unsigned long msr);
> +
> +typedef long (*syscall_fn)(long, long, long, long, long, long);
> +
> +/* Has to run notrace because it is entered "unreconciled" */
> +notrace long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8,
> + unsigned long r0, struct pt_regs *regs)
> +{
> + unsigned long ti_flags;
> + syscall_fn f;
> +
> + BUG_ON(!(regs->msr & MSR_PR));
> +
> + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
> + unlikely(regs->msr & MSR_TS_T))
> + tabort_syscall(regs->nip, regs->msr);
> +
> + account_cpu_user_entry();
> +
> +#ifdef CONFIG_PPC_SPLPAR
> + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) &&
> + firmware_has_feature(FW_FEATURE_SPLPAR)) {
> + struct lppaca *lp = local_paca->lppaca_ptr;
> +
> + if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx)))
> + accumulate_stolen_time();
> + }
> +#endif
> +
> + kuap_check_amr();
> +
> + /*
> + * A syscall should always be called with interrupts enabled
> + * so we just unconditionally hard-enable here. When some kind
> + * of irq tracing is used, we additionally check that condition
> + * is correct
> + */
> + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) {
> + WARN_ON(irq_soft_mask_return() != IRQS_ENABLED);
> + WARN_ON(local_paca->irq_happened);
> + }
> + /*
> + * This is not required for the syscall exit path, but makes the
> + * stack frame look nicer. If this was initialised in the first stack
> + * frame, or if the unwinder was taught the first stack frame always
> + * returns to user with IRQS_ENABLED, this store could be avoided!
> + */
> + regs->softe = IRQS_ENABLED;
softe doesn't exist on PPC32. Can we do that through a helper ?
> +
> + __hard_irq_enable();
This doesn't exist on PPC32. Should we define __hard_irq_enable() as
arch_local_irq_enable() on PPC32 ?
> +
> + ti_flags = current_thread_info()->flags;
> + if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
> + /*
> + * We use the return value of do_syscall_trace_enter() as the
> + * syscall number. If the syscall was rejected for any reason
> + * do_syscall_trace_enter() returns an invalid syscall number
> + * and the test against NR_syscalls will fail and the return
> + * value to be used is in regs->gpr[3].
> + */
> + r0 = do_syscall_trace_enter(regs);
> + if (unlikely(r0 >= NR_syscalls))
> + return regs->gpr[3];
> + r3 = regs->gpr[3];
> + r4 = regs->gpr[4];
> + r5 = regs->gpr[5];
> + r6 = regs->gpr[6];
> + r7 = regs->gpr[7];
> + r8 = regs->gpr[8];
> +
> + } else if (unlikely(r0 >= NR_syscalls)) {
> + return -ENOSYS;
> + }
> +
> + /* May be faster to do array_index_nospec? */
> + barrier_nospec();
> +
> + if (unlikely(ti_flags & _TIF_32BIT)) {
Use is_compat_task() instead ?
> + f = (void *)compat_sys_call_table[r0];
> +
> + r3 &= 0x00000000ffffffffULL;
> + r4 &= 0x00000000ffffffffULL;
> + r5 &= 0x00000000ffffffffULL;
> + r6 &= 0x00000000ffffffffULL;
> + r7 &= 0x00000000ffffffffULL;
> + r8 &= 0x00000000ffffffffULL;
> +
> + } else {
> + f = (void *)sys_call_table[r0];
> + }
> +
> + return f(r3, r4, r5, r6, r7, r8);
> +}
> +
> +/*
> + * This should be called after a syscall returns, with r3 the return value
> + * from the syscall. If this function returns non-zero, the system call
> + * exit assembly should additionally load all GPR registers and CTR and XER
> + * from the interrupt frame.
> + *
> + * The function graph tracer can not trace the return side of this function,
> + * because RI=0 and soft mask state is "unreconciled", so it is marked notrace.
> + */
> +notrace unsigned long syscall_exit_prepare(unsigned long r3,
> + struct pt_regs *regs)
> +{
> + unsigned long *ti_flagsp = ¤t_thread_info()->flags;
> + unsigned long ti_flags;
> + unsigned long ret = 0;
> +
> + regs->result = r3;
> +
> + /* Check whether the syscall is issued inside a restartable sequence */
> + rseq_syscall(regs);
> +
> + ti_flags = *ti_flagsp;
> +
> + if (unlikely(r3 >= (unsigned long)-MAX_ERRNO)) {
> + if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
> + r3 = -r3;
> + regs->ccr |= 0x10000000; /* Set SO bit in CR */
> + }
> + }
> +
> + if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
> + if (ti_flags & _TIF_RESTOREALL)
> + ret = _TIF_RESTOREALL;
> + else
> + regs->gpr[3] = r3;
> + clear_bits(_TIF_PERSYSCALL_MASK, ti_flagsp);
> + } else {
> + regs->gpr[3] = r3;
> + }
> +
> + if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
> + do_syscall_trace_leave(regs);
> + ret |= _TIF_RESTOREALL;
> + }
> +
> +again:
> + local_irq_disable();
> + ti_flags = READ_ONCE(*ti_flagsp);
> + while (unlikely(ti_flags & _TIF_USER_WORK_MASK)) {
> + local_irq_enable();
> + if (ti_flags & _TIF_NEED_RESCHED) {
> + schedule();
> + } else {
> + /*
> + * SIGPENDING must restore signal handler function
> + * argument GPRs, and some non-volatiles (e.g., r1).
> + * Restore all for now. This could be made lighter.
> + */
> + if (ti_flags & _TIF_SIGPENDING)
> + ret |= _TIF_RESTOREALL;
> + do_notify_resume(regs, ti_flags);
> + }
> + local_irq_disable();
> + ti_flags = READ_ONCE(*ti_flagsp);
> + }
> +
> + if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
> + unsigned long mathflags = MSR_FP;
> +
> + if (IS_ENABLED(CONFIG_ALTIVEC))
> + mathflags |= MSR_VEC;
> +
> + if ((regs->msr & mathflags) != mathflags)
> + restore_math(regs);
> + }
> +
> + /* This must be done with RI=1 because tracing may touch vmaps */
> + trace_hardirqs_on();
> +
> + /* This pattern matches prep_irq_for_idle */
> + __hard_EE_RI_disable();
> + if (unlikely(lazy_irq_pending())) {
> + __hard_RI_enable();
> + trace_hardirqs_off();
> + local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
> + local_irq_enable();
> + /* Took an interrupt which may have more exit work to do. */
> + goto again;
> + }
> + local_paca->irq_happened = 0;
> + irq_soft_mask_set(IRQS_ENABLED);
> +
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> + local_paca->tm_scratch = regs->msr;
> +#endif
> +
> + kuap_check_amr();
> +
> + account_cpu_user_exit();
> +
> + return ret;
> +}
> diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
> index 5b905a2f4e4d..d34276f3c495 100644
> --- a/arch/powerpc/kernel/systbl.S
> +++ b/arch/powerpc/kernel/systbl.S
> @@ -16,25 +16,22 @@
>
> #ifdef CONFIG_PPC64
> .p2align 3
> +#define __SYSCALL(nr, entry) .8byte entry
> +#else
> +#define __SYSCALL(nr, entry) .long entry
> #endif
>
> .globl sys_call_table
> sys_call_table:
> #ifdef CONFIG_PPC64
> -#define __SYSCALL(nr, entry) .8byte DOTSYM(entry)
> #include <asm/syscall_table_64.h>
> -#undef __SYSCALL
> #else
> -#define __SYSCALL(nr, entry) .long entry
> #include <asm/syscall_table_32.h>
> -#undef __SYSCALL
> #endif
>
> #ifdef CONFIG_COMPAT
> .globl compat_sys_call_table
> compat_sys_call_table:
> #define compat_sys_sigsuspend sys_sigsuspend
> -#define __SYSCALL(nr, entry) .8byte DOTSYM(entry)
> #include <asm/syscall_table_c32.h>
> -#undef __SYSCALL
> #endif
>
Christophe
More information about the Linuxppc-dev
mailing list