[PATCH v3 5/6] powerpc/64: Add support for out-of-line static calls

Christophe Leroy christophe.leroy@csgroup.eu
Thu Oct 6 06:38:28 AEDT 2022



On 05/10/2022 at 07:32, Benjamin Gray wrote:
> Implement static call support for 64 bit V2 ABI. This requires making
> sure the TOC is kept correct across kernel-module boundaries. As a
> secondary concern, it tries to use the local entry point of a target
> wherever possible. It does so by checking if both tramp & target are
> kernel code, and falls back to detecting the common global entry point
> patterns if modules are involved. Detecting the global entry point is
> also required for setting the local entry point as the trampoline
> target: if we cannot detect the local entry point, then we need to
> conservatively initialise r12 and use the global entry point.
> 
> The trampolines are marked with `.localentry NAME, 1` to make the
> linker save and restore the TOC on each call to the trampoline. This
> allows the trampoline to safely target functions with different TOC
> values.
> 
> However this directive also implies the TOC is not initialised on entry
> to the trampoline. The kernel TOC is easily found in the PACA, but not
> an arbitrary module TOC. Therefore the trampoline implementation depends
> on whether it's in the kernel or not. If in the kernel, we initialise
> the TOC using the PACA. If in a module, we have to initialise the TOC
> with zero context, so it's quite expensive.
> 
> Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>

This looks good to me

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>

However, thinking out loud, I'm wondering: could we make things any 
simpler when CONFIG_MODULES is not selected, or is that too much of a 
corner case on PPC64?

I'm asking because on embedded PPC32 it is common to build without 
CONFIG_MODULES set.
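
For instance (a completely untested sketch just to illustrate the idea, 
not something from this patch): with CONFIG_MODULES=n every target is 
core kernel text, so shares_toc() could collapse to a constant and the 
expensive module variant of the trampoline would never be needed:

	static bool shares_toc(void *func1, void *func2)
	{
		/* Without modules there is only one TOC, the kernel's */
		if (!IS_ENABLED(CONFIG_MODULES))
			return true;

		if (func1 == NULL || func2 == NULL)
			return false;

		return core_kernel_text((unsigned long)func1) &&
		       core_kernel_text((unsigned long)func2);
	}

(with modules enabled the ppc_function_toc() fallback elided above 
would of course still be wanted)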

> ---
>   arch/powerpc/Kconfig                     |  14 ++-
>   arch/powerpc/include/asm/code-patching.h |   1 +
>   arch/powerpc/include/asm/static_call.h   |  80 +++++++++++++-
>   arch/powerpc/kernel/Makefile             |   3 +-
>   arch/powerpc/kernel/static_call.c        | 130 +++++++++++++++++++++--
>   5 files changed, 216 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 4c466acdc70d..962e36ec34ec 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -102,6 +102,18 @@ config GENERIC_HWEIGHT
>   	bool
>   	default y
>   
> +config TOOLCHAIN_SUPPORTS_LOCALENTRY1
> +	bool
> +	depends on PPC64_ELF_ABI_V2
> +	default y if LD_VERSION >= 23200 || LLD_VERSION >= 110000
> +	help
> +	  A section of the ELF symbol st_other field can be given the value 1
> +	  using the directive '.localentry NAME, 1' to mean the local and global
> +	  entry points are the same, and r2 should be treated as caller-saved.
> +
> +	  Older versions of Clang and binutils do not recognise this form of the
> +	  directive and will error if it is used.
> +
>   config PPC
>   	bool
>   	default y
> @@ -248,7 +260,7 @@ config PPC
>   	select HAVE_SOFTIRQ_ON_OWN_STACK
>   	select HAVE_STACKPROTECTOR		if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
>   	select HAVE_STACKPROTECTOR		if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
> -	select HAVE_STATIC_CALL			if PPC32
> +	select HAVE_STATIC_CALL			if PPC32 || (PPC64_ELF_ABI_V2 && TOOLCHAIN_SUPPORTS_LOCALENTRY1)
>   	select HAVE_SYSCALL_TRACEPOINTS
>   	select HAVE_VIRT_CPU_ACCOUNTING
>   	select HUGETLB_PAGE_SIZE_VARIABLE	if PPC_BOOK3S_64 && HUGETLB_PAGE
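
A small aside on the help text: the "section" of st_other in question 
is its three most significant bits. As a sketch of how those bits 
decode (the constants mirror the ELFv2 ABI definitions, e.g. binutils' 
elf/ppc64.h; this is not code from the patch):

	#define STO_PPC64_LOCAL_BIT	5
	#define STO_PPC64_LOCAL_MASK	0xe0

	/* Values 0 and 1 both decode to a zero local entry offset; the
	 * value 1 emitted by '.localentry NAME, 1' additionally means
	 * r2 is caller-saved, which is the property relied on here.
	 */
	static unsigned int ppc64_local_entry_offset(unsigned char st_other)
	{
		unsigned int v = (st_other & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT;

		return ((1u << v) >> 2) << 2;
	}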
> diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
> index 170bfa848c7c..cb4629e55e57 100644
> --- a/arch/powerpc/include/asm/code-patching.h
> +++ b/arch/powerpc/include/asm/code-patching.h
> @@ -152,6 +152,7 @@ int translate_branch(ppc_inst_t *instr, const u32 *dest, const u32 *src);
>   bool is_conditional_branch(ppc_inst_t instr);
>   
>   #define OP_RT_RA_MASK	0xffff0000UL
> +#define OP_SI_MASK	0x0000ffffUL
>   #define LIS_R2		(PPC_RAW_LIS(_R2, 0))
>   #define ADDIS_R2_R12	(PPC_RAW_ADDIS(_R2, _R12, 0))
>   #define ADDI_R2_R2	(PPC_RAW_ADDI(_R2, _R2, 0))
> diff --git a/arch/powerpc/include/asm/static_call.h b/arch/powerpc/include/asm/static_call.h
> index de1018cc522b..3d6e82200cb7 100644
> --- a/arch/powerpc/include/asm/static_call.h
> +++ b/arch/powerpc/include/asm/static_call.h
> @@ -2,12 +2,75 @@
>   #ifndef _ASM_POWERPC_STATIC_CALL_H
>   #define _ASM_POWERPC_STATIC_CALL_H
>   
> +#ifdef CONFIG_PPC64_ELF_ABI_V2
> +
> +#ifdef MODULE
> +
> +#define __PPC_SCT(name, inst)					\
> +	asm(".pushsection .text, \"ax\"				\n"	\
> +	    ".align 6						\n"	\
> +	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
> +	    ".localentry " STATIC_CALL_TRAMP_STR(name) ", 1	\n"	\
> +	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
> +	    "	mflr	11					\n"	\
> +	    "	bcl	20, 31, $+4				\n"	\
> +	    "0:	mflr	12					\n"	\
> +	    "	mtlr	11					\n"	\
> +	    "	addi	12, 12, (" STATIC_CALL_TRAMP_STR(name) " - 0b)	\n"	\
> +	    "	addis 2, 12, (.TOC.-" STATIC_CALL_TRAMP_STR(name) ")@ha	\n"	\
> +	    "	addi 2, 2, (.TOC.-" STATIC_CALL_TRAMP_STR(name) ")@l	\n"	\
> +	    "	" inst "					\n"	\
> +	    "	ld	12, (2f - " STATIC_CALL_TRAMP_STR(name) ")(12)	\n"	\
> +	    "	mtctr	12					\n"	\
> +	    "	bctr						\n"	\
> +	    "1:	li	3, 0					\n"	\
> +	    "	blr						\n"	\
> +	    ".balign 8						\n"	\
> +	    "2:	.8byte 0					\n"	\
> +	    ".type " STATIC_CALL_TRAMP_STR(name) ", @function	\n"	\
> +	    ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
> +	    ".popsection					\n")
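
Neat PC-relative TOC setup in the module variant, by the way: 
"bcl 20, 31, $+4" is the special form that sets LR to the address of 
the next instruction without polluting the link stack predictor, 
"mflr 12" captures that address, "addi 12, 12, (name - 0b)" rewinds it 
to the trampoline start, and the addis/addi pair then applies the 
link-time (.TOC. - name) offset to recover the module TOC in r2.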
> +
> +#else /* KERNEL */
> +
> +#define __PPC_SCT(name, inst)					\
> +	asm(".pushsection .text, \"ax\"				\n"	\
> +	    ".align 5						\n"	\
> +	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
> +	    ".localentry " STATIC_CALL_TRAMP_STR(name) ", 1	\n"	\
> +	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
> +	    "	ld	2, 16(13)				\n"	\
> +	    "	" inst "					\n"	\
> +	    "	addis	12, 2, 2f@toc@ha			\n"	\
> +	    "	ld	12, 2f@toc@l(12)			\n"	\
> +	    "	mtctr	12					\n"	\
> +	    "	bctr						\n"	\
> +	    "1:	li	3, 0					\n"	\
> +	    "	blr						\n"	\
> +	    ".balign 8						\n"	\
> +	    "2:	.8byte 0					\n"	\
> +	    ".type " STATIC_CALL_TRAMP_STR(name) ", @function	\n"	\
> +	    ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
> +	    ".popsection					\n")
> +
> +#endif /* MODULE */
> +
> +#define PPC_SCT_INST_MODULE		28		/* Offset of instruction to update */
> +#define PPC_SCT_RET0_MODULE		44		/* Offset of label 1 */
> +#define PPC_SCT_DATA_MODULE		56		/* Offset of label 2 (aligned) */
> +
> +#define PPC_SCT_INST_KERNEL		4		/* Offset of instruction to update */
> +#define PPC_SCT_RET0_KERNEL		24		/* Offset of label 1 */
> +#define PPC_SCT_DATA_KERNEL		32		/* Offset of label 2 (aligned) */
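
For what it's worth I checked these offsets against the asm above (4 
bytes per instruction, label 2 padded to an 8-byte boundary):

  module: mflr@0  bcl@4   mflr@8   mtlr@12 addi@16 addis@20 addi@24
          inst@28 ld@32   mtctr@36 bctr@40 li@44 (1:) blr@48 .8byte@56 (2:)
  kernel: ld@0    inst@4  addis@8  ld@12   mtctr@16 bctr@20
          li@24 (1:)      blr@28   .8byte@32 (2:)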
> +
> +#elif defined(CONFIG_PPC32)
> +
>   #define __PPC_SCT(name, inst)					\
>   	asm(".pushsection .text, \"ax\"				\n"	\
>   	    ".align 5						\n"	\
>   	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
>   	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
> -	    inst "						\n"	\
> +	    "	" inst "					\n"	\
>   	    "	lis	12,2f@ha				\n"	\
>   	    "	lwz	12,2f@l(12)				\n"	\
>   	    "	mtctr	12					\n"	\
> @@ -19,11 +82,20 @@
>   	    ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
>   	    ".popsection					\n")
>   
> -#define PPC_SCT_RET0		20		/* Offset of label 1 */
> -#define PPC_SCT_DATA		28		/* Offset of label 2 */
> +#define PPC_SCT_INST_MODULE		0		/* Offset of instruction to update */
> +#define PPC_SCT_RET0_MODULE		20		/* Offset of label 1 */
> +#define PPC_SCT_DATA_MODULE		28		/* Offset of label 2 */
> +
> +#define PPC_SCT_INST_KERNEL		PPC_SCT_INST_MODULE
> +#define PPC_SCT_RET0_KERNEL		PPC_SCT_RET0_MODULE
> +#define PPC_SCT_DATA_KERNEL		PPC_SCT_DATA_MODULE
> +
> +#else /* !CONFIG_PPC64_ELF_ABI_V2 && !CONFIG_PPC32 */
> +#error "Unsupported ABI"
> +#endif /* CONFIG_PPC64_ELF_ABI_V2 */
>   
>   #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)	__PPC_SCT(name, "b " #func)
>   #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)	__PPC_SCT(name, "blr")
> -#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)	__PPC_SCT(name, "b .+20")
> +#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name)	__PPC_SCT(name, "b 1f")
>   
>   #endif /* _ASM_POWERPC_STATIC_CALL_H */
> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index 06d2d1f78f71..a30d0d0f5499 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -128,8 +128,9 @@ extra-y				+= vmlinux.lds
>   
>   obj-$(CONFIG_RELOCATABLE)	+= reloc_$(BITS).o
>   
> -obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o early_32.o static_call.o
> +obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o early_32.o
>   obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o
> +obj-$(CONFIG_HAVE_STATIC_CALL)	+= static_call.o
>   obj-$(CONFIG_KGDB)		+= kgdb.o
>   obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
>   obj-$(CONFIG_SMP)		+= smp.o
> diff --git a/arch/powerpc/kernel/static_call.c b/arch/powerpc/kernel/static_call.c
> index 863a7aa24650..9211b2e189bb 100644
> --- a/arch/powerpc/kernel/static_call.c
> +++ b/arch/powerpc/kernel/static_call.c
> @@ -1,33 +1,151 @@
>   // SPDX-License-Identifier: GPL-2.0
> +#include <linux/bitops.h>
>   #include <linux/memory.h>
>   #include <linux/static_call.h>
>   
>   #include <asm/code-patching.h>
>   
> +static long sign_extend_long(unsigned long value, int index)
> +{
> +	if (sizeof(long) == 8)
> +		return sign_extend64(value, index);
> +	else
> +		return sign_extend32(value, index);
> +}
> +
> +static void *ppc_function_toc(u32 *func)
> +{
> +	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) {
> +		/* There are two common global entry sequences we handle below
> +		 *
> +		 * 1. addis r2, r12, SI1
> +		 *    addi  r2, r2, SI2
> +		 *
> +		 * 2. lis  r2, SI1
> +		 *    addi r2, r2, SI2
> +		 *
> +		 * Where r12 contains the global entry point address (it is otherwise
> +		 * uninitialised, so doesn't matter what value we use if this is not
> +		 * a separate global entry point).
> +		 *
> +		 * Here we simulate running the given sequence and return the result it
> +		 * would calculate. If the sequence is not recognised we return NULL.
> +		 */
> +		u32 insn1 = *func;
> +		u32 insn2 = *(func + 1);
> +		unsigned long op_regs1 = insn1 & OP_RT_RA_MASK;
> +		unsigned long op_regs2 = insn2 & OP_RT_RA_MASK;
> +		unsigned long si1 = insn1 & OP_SI_MASK;
> +		unsigned long si2 = insn2 & OP_SI_MASK;
> +		unsigned long imm1 = sign_extend_long(si1 << 16, 31);
> +		unsigned long imm2 = sign_extend_long(si2, 15);
> +		unsigned long addr = 0;
> +
> +		/* Simulate the first instruction */
> +		if (op_regs1 == ADDIS_R2_R12)
> +			addr += (unsigned long)func + imm1;
> +		else if (op_regs1 == LIS_R2)
> +			addr += imm1;
> +		else
> +			return NULL;
> +
> +		/* Simulate the second instruction */
> +		if (op_regs2 == ADDI_R2_R2)
> +			addr += imm2;
> +		else
> +			return NULL;
> +
> +		return (void *)addr;
> +	}
> +
> +	return NULL;
> +}
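
To make the simulation concrete with a made-up example: for a module 
function at 0xd000000000a01000 whose global entry point begins with 
"addis r2, r12, 0x2 ; addi r2, r2, -0x7f00", ppc_function_toc() 
computes 0xd000000000a01000 + (0x2 << 16) - 0x7f00 = 0xd000000000a19100, 
the same TOC pointer the real prologue would derive from r12.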
> +
> +static bool shares_toc(void *func1, void *func2)
> +{
> +	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) {
> +		void *func1_toc;
> +		void *func2_toc;
> +
> +		if (func1 == NULL || func2 == NULL)
> +			return false;
> +
> +		/* Assume the kernel only uses a single TOC */
> +		if (core_kernel_text((unsigned long)func1) &&
> +		    core_kernel_text((unsigned long)func2))
> +			return true;
> +
> +		/* Fall back to calculating the TOC from common patterns
> +		 * if modules are involved
> +		 */
> +		func1_toc = ppc_function_toc(func1);
> +		func2_toc = ppc_function_toc(func2);
> +		return func1_toc != NULL && func2_toc != NULL && func1_toc == func2_toc;
> +	}
> +
> +	return true;
> +}
> +
> +static void *get_inst_addr(void *tramp)
> +{
> +	return tramp + (core_kernel_text((unsigned long)tramp)
> +				? PPC_SCT_INST_KERNEL
> +				: PPC_SCT_INST_MODULE);
> +}
> +
> +static void *get_ret0_addr(void *tramp)
> +{
> +	return tramp + (core_kernel_text((unsigned long)tramp)
> +				? PPC_SCT_RET0_KERNEL
> +				: PPC_SCT_RET0_MODULE);
> +}
> +
> +static void *get_data_addr(void *tramp)
> +{
> +	return tramp + (core_kernel_text((unsigned long) tramp)
> +				? PPC_SCT_DATA_KERNEL
> +				: PPC_SCT_DATA_MODULE);
> +}
> +
>   void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
>   {
>   	int err;
>   	bool is_ret0 = (func == __static_call_return0);
> -	unsigned long target = (unsigned long)(is_ret0 ? tramp + PPC_SCT_RET0 : func);
> -	bool is_short = is_offset_in_branch_range((long)target - (long)tramp);
> +	bool is_short;
> +	void *target = is_ret0 ? get_ret0_addr(tramp) : func;
> +	void *tramp_inst = get_inst_addr(tramp);
>   
>   	if (!tramp)
>   		return;
>   
> +	if (is_ret0)
> +		is_short = true;
> +	else if (shares_toc(tramp, target))
> +		is_short = is_offset_in_branch_range(
> +			(long)ppc_function_entry(target) - (long)tramp_inst);
> +	else
> +		/* Combine out-of-range with not sharing a TOC. Though it's possible
> +		 * an out-of-range target shares a TOC, handling this separately
> +		 * complicates the trampoline. It's simpler to always use the global
> +		 * entry point in this case.
> +		 */
> +		is_short = false;
> +
>   	mutex_lock(&text_mutex);
>   
>   	if (func && !is_short) {
> -		err = patch_instruction(tramp + PPC_SCT_DATA, ppc_inst(target));
> +		err = patch_ulong(get_data_addr(tramp), (unsigned long)target);
>   		if (err)
>   			goto out;
>   	}
>   
>   	if (!func)
> -		err = patch_instruction(tramp, ppc_inst(PPC_RAW_BLR()));
> +		err = patch_instruction(tramp_inst, ppc_inst(PPC_RAW_BLR()));
>   	else if (is_short)
> -		err = patch_branch(tramp, target, 0);
> +		err = patch_branch(tramp_inst, ppc_function_entry(target), 0);
>   	else
> -		err = patch_instruction(tramp, ppc_inst(PPC_RAW_NOP()));
> +		err = patch_instruction(tramp_inst, ppc_inst(PPC_RAW_NOP()));
> +
>   out:
>   	mutex_unlock(&text_mutex);
>   
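
For anyone wanting to give the series a spin, the caller side is just 
the generic static call API; a minimal (made-up) example:

	#include <linux/static_call.h>

	static int default_handler(int x)
	{
		return x;
	}

	DEFINE_STATIC_CALL(my_handler, default_handler);

	static int fast_handler(int x)
	{
		return x + 1;
	}

	int do_call(int x)
	{
		/* Call site goes via the trampoline defined above */
		return static_call(my_handler)(x);
	}

	void speed_up(void)
	{
		/* Repatches the trampoline via arch_static_call_transform() */
		static_call_update(my_handler, &fast_handler);
	}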

