[PATCH 4/4] powerpc/64: system call implement the bulk of the logic in C

Christophe Leroy christophe.leroy at c-s.fr
Tue Aug 27 16:46:29 AEST 2019



Le 27/08/2019 à 05:30, Nicholas Piggin a écrit :
> System call entry and particularly exit code is beyond the limit of what
> is reasonable to implement in asm.
> 
> This conversion moves all conditional branches out of the asm code,
> except a relatively simple test to see whether all GPRs should be
> restored at exit time.
> 
> Null syscall test is about 5% faster after this patch, because the exit
> work is handled under local_irq_disable, and the hard mask and pending
> interrupt replay is handled after that, which avoids games with MSR.

Interesting optimisation.

> 
> The asm instruction scheduling has not really been analysed and
> optimised yet, as there are higher level features and improvements to
> add first.
> 
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
> ---
>   arch/powerpc/include/asm/asm-prototypes.h |  11 -
>   arch/powerpc/include/asm/ptrace.h         |   3 +
>   arch/powerpc/include/asm/signal.h         |   2 +
>   arch/powerpc/include/asm/switch_to.h      |   4 +
>   arch/powerpc/include/asm/time.h           |   3 +
>   arch/powerpc/kernel/Makefile              |   3 +-
>   arch/powerpc/kernel/entry_64.S            | 343 ++++------------------
>   arch/powerpc/kernel/process.c             |   6 +-
>   arch/powerpc/kernel/signal.h              |   2 -
>   arch/powerpc/kernel/syscall_64.c          | 202 +++++++++++++
>   10 files changed, 271 insertions(+), 308 deletions(-)
>   create mode 100644 arch/powerpc/kernel/syscall_64.c
> 

[...]

> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 24621e7e5033..f444525da9ce 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1609,7 +1609,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
>   	extern void ret_from_kernel_thread(void);
>   	void (*f)(void);
>   	unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE;
> -	struct thread_info *ti = task_thread_info(p);
>   
>   	klp_init_thread_info(p);
>   
> @@ -1617,6 +1616,8 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
>   	sp -= sizeof(struct pt_regs);
>   	childregs = (struct pt_regs *) sp;
>   	if (unlikely(p->flags & PF_KTHREAD)) {
> +		struct thread_info *ti = task_thread_info(p);
> +
>   		/* kernel thread */
>   		memset(childregs, 0, sizeof(struct pt_regs));
>   		childregs->gpr[1] = sp + sizeof(struct pt_regs);
> @@ -1634,12 +1635,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp,
>   	} else {
>   		/* user thread */
>   		struct pt_regs *regs = current_pt_regs();
> +
>   		CHECK_FULL_REGS(regs);
>   		*childregs = *regs;
>   		if (usp)
>   			childregs->gpr[1] = usp;
>   		p->thread.regs = childregs;
> -		childregs->gpr[3] = 0;  /* Result from fork() */
> +		/* ret_from_fork sets fork() result to 0 */

Does PPC32 ret_from_fork() do it as well ?

>   		if (clone_flags & CLONE_SETTLS) {
>   #ifdef CONFIG_PPC64
>   			if (!is_32bit_task())
> diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
> index 800433685888..d396efca4068 100644
> --- a/arch/powerpc/kernel/signal.h
> +++ b/arch/powerpc/kernel/signal.h
> @@ -10,8 +10,6 @@
>   #ifndef _POWERPC_ARCH_SIGNAL_H
>   #define _POWERPC_ARCH_SIGNAL_H
>   
> -extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
> -
>   extern void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
>   				  size_t frame_size, int is_32);
>   
> diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
> new file mode 100644
> index 000000000000..98ed970796d5
> --- /dev/null
> +++ b/arch/powerpc/kernel/syscall_64.c
> @@ -0,0 +1,202 @@
> +#include <linux/err.h>
> +#include <asm/hw_irq.h>
> +#include <asm/paca.h>
> +#include <asm/ptrace.h>
> +#include <asm/reg.h>
> +#include <asm/signal.h>
> +#include <asm/switch_to.h>
> +#include <asm/syscall.h>
> +#include <asm/time.h>
> +
> +extern void __noreturn tabort_syscall(void);
> +
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> +static inline void account_cpu_user_entry(void)
> +{
> +	unsigned long tb = mftb();
> +
> +	local_paca->accounting.utime += (tb - local_paca->accounting.starttime_user);
> +	local_paca->accounting.starttime = tb;
> +}
> +static inline void account_cpu_user_exit(void)
> +{
> +	unsigned long tb = mftb();
> +
> +	local_paca->accounting.stime += (tb - local_paca->accounting.starttime);
> +	local_paca->accounting.starttime_user = tb;
> +}
> +#else
> +static inline void account_cpu_user_entry(void)
> +{
> +}
> +static inline void account_cpu_user_exit(void)
> +{
> +}
> +#endif

I think this block should go in arch/powerpc/include/asm/cputime.h, we 
should limit #ifdefs as much as possible in C files.

And use get_accounting() instead of local_paca->accounting in order to 
be usable on PPC32 as well



> +
> +typedef long (*syscall_fn)(long, long, long, long, long, long);
> +
> +long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs)
> +{
> +	unsigned long ti_flags;
> +	syscall_fn f;
> +
> +	BUG_ON(!(regs->msr & MSR_PR));
> +
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +	if (unlikely(regs->msr & MSR_TS_T))
> +		tabort_syscall();
> +#endif

MSR_TS_T and tabort_syscall() are declared regardless of 
CONFIG_PPC_TRANSACTIONAL_MEM so IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) 
should be used instead of #ifdef

> +
> +	account_cpu_user_entry();
> +
> +#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
> +	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
> +		struct lppaca *lp = get_lppaca();
> +
> +		if (unlikely(local_paca->dtl_ridx != lp->dtl_idx))
> +			accumulate_stolen_time();
> +	}
> +#endif

Same here, I think everything is available so IS_ENABLED() should be 
used instead of #if

> +
> +#ifdef CONFIG_PPC_KUAP_DEBUG
> +	if (mmu_has_feature(MMU_FTR_RADIX_KUAP))
> +		WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
> +#endif

This should go in a helper in one of the kup/kuap header files.

> +
> +	/*
> +	 * A syscall should always be called with interrupts enabled
> +	 * so we just unconditionally hard-enable here. When some kind
> +	 * of irq tracing is used, we additionally check that condition
> +	 * is correct
> +	 */
> +#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)
> +	WARN_ON(irq_soft_mask_return() != IRQS_ENABLED);
> +	WARN_ON(local_paca->irq_happened);
> +#endif

Can we use IS_ENABLED() here as well ?

> +
> +	__hard_irq_enable();
> +
> +	/*
> +	 * We do need to set SOFTE in the stack frame or the return
> +	 * from interrupt will be painful
> +	 */
> +	regs->softe = IRQS_ENABLED;
> +
> +	ti_flags = current_thread_info()->flags;
> +	if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
> +		/*
> +		 * We use the return value of do_syscall_trace_enter() as the
> +		 * syscall number. If the syscall was rejected for any reason
> +		 * do_syscall_trace_enter() returns an invalid syscall number
> +		 * and the test below against NR_syscalls will fail.
> +		 */
> +		r0 = do_syscall_trace_enter(regs);
> +	}
> +
> +	if (unlikely(r0 >= NR_syscalls))
> +		return -ENOSYS;
> +
> +	/* May be faster to do array_index_nospec? */
> +	barrier_nospec();
> +
> +	if (unlikely(ti_flags & _TIF_32BIT)) {
> +		f = (void *)compat_sys_call_table[r0];
> +
> +		r3 &= 0x00000000ffffffffULL;
> +		r4 &= 0x00000000ffffffffULL;
> +		r5 &= 0x00000000ffffffffULL;
> +		r6 &= 0x00000000ffffffffULL;
> +		r7 &= 0x00000000ffffffffULL;
> +		r8 &= 0x00000000ffffffffULL;
> +
> +	} else {
> +		f = (void *)sys_call_table[r0];
> +	}
> +
> +	return f(r3, r4, r5, r6, r7, r8);
> +}
> +
> +unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs)
> +{
> +	unsigned long *ti_flagsp = &current_thread_info()->flags;
> +	unsigned long ti_flags;
> +	unsigned long ret = 0;
> +
> +	regs->result = r3;
> +
> +	/* Check whether the syscall is issued inside a restartable sequence */
> +	rseq_syscall(regs);
> +
> +	ti_flags = *ti_flagsp;
> +	if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE))
> +		do_syscall_trace_leave(regs);
> +
> +	if (unlikely(r3 >= (unsigned long)-MAX_ERRNO)) {
> +		if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
> +			r3 = -r3;
> +			regs->ccr |= 0x10000000; /* Set SO bit in CR */
> +		}
> +	}
> +
> +	if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
> +		if (ti_flags & _TIF_RESTOREALL)
> +			ret = _TIF_RESTOREALL;
> +		else
> +			regs->gpr[3] = r3;
> +		clear_bits(_TIF_PERSYSCALL_MASK, ti_flagsp);
> +	} else {
> +		regs->gpr[3] = r3;
> +	}
> +
> +again:
> +	local_irq_disable();
> +	ti_flags = READ_ONCE(*ti_flagsp);
> +	while (unlikely(ti_flags & _TIF_USER_WORK_MASK)) {
> +		local_irq_enable();
> +		if (ti_flags & _TIF_NEED_RESCHED) {
> +			schedule();
> +		} else {
> +			/*
> +			 * SIGPENDING must restore signal handler function
> +			 * argument GPRs, and some non-volatiles (e.g., r1).
> +			 * Restore all for now. This could be made lighter.
> +			 */
> +			if (ti_flags & _TIF_SIGPENDING)
> +				ret |= _TIF_RESTOREALL;
> +			do_notify_resume(regs, ti_flags);
> +		}
> +		local_irq_disable();
> +		ti_flags = READ_ONCE(*ti_flagsp);
> +	}
> +
> +#ifdef CONFIG_ALTIVEC
> +	if ((regs->msr & (MSR_FP|MSR_VEC)) != (MSR_FP|MSR_VEC))
> +#else
> +	if ((regs->msr & MSR_FP) != MSR_FP)
> +#endif

Use 'if (IS_ENABLED(CONFIG_ALTIVEC)) / else' instead of an 
#ifdef/#else/#endif

> +		restore_math(regs);
> +
> +	/* This pattern matches prep_irq_for_idle */
> +	__mtmsrd(0, 1);	/* Disable MSR_EE and MSR_RI */
> +	if (unlikely(lazy_irq_pending())) {
> +		__mtmsrd(MSR_RI, 1);
> +		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
> +		local_irq_enable();
> +		/* Took an interrupt which may have more exit work to do. */
> +		goto again;
> +	}
> +	trace_hardirqs_on();
> +	local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
> +	irq_soft_mask_set(IRQS_ENABLED);
> +
> +	account_cpu_user_exit();
> +
> +#ifdef CONFIG_PPC_KUAP_DEBUG
> +	if (mmu_has_feature(MMU_FTR_RADIX_KUAP))
> +		WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
> +#endif

Same, define a helper in the kuap header files to avoid the #ifdefs and 
platform specif stuff here.

> +
> +	return ret;
> +}
> +
> 

Christophe


More information about the Linuxppc-dev mailing list