[RESEND 2/3] powerpc/memcpy: Add memcpy_mcsafe for pmem
Nicholas Piggin
npiggin at gmail.com
Thu Apr 5 09:57:55 AEST 2018
On Thu, 5 Apr 2018 09:19:42 +1000
Balbir Singh <bsingharora at gmail.com> wrote:
> The pmem infrastructure uses memcpy_mcsafe in the pmem
> layer to convert a machine check exception encountered
> during the memcpy into a return value on failure.
>
> This patch largely borrows from the copyuser_power7
> logic and omits the VMX optimizations to keep the
> patch simple. If needed, those optimizations can be
> folded in later.
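To make the calling convention concrete, here is a minimal
hypothetical sketch of how a pmem-style caller could consume the
return value (the function name and the -EIO mapping are
illustrative, not the actual pmem code):

	/* Hypothetical caller: memcpy_mcsafe() returns 0 on success
	 * or -EFAULT if a machine check is hit during the copy. */
	static int read_pmem_buf(void *dst, const void *src, size_t len)
	{
		if (memcpy_mcsafe(dst, src, len))
			return -EIO;	/* surface as a media error */
		return 0;
	}

Note the destination may be partially written when the copy fails,
so callers should not rely on its contents in the error case.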
So memcpy_mcsafe doesn't return the number of bytes copied?
Huh, well that makes it simple.
Would be nice if there were an easy way to share this with
the regular memcpy code... that's for another day though;
better to let this settle down first.
I didn't review the exact instructions, but the approach
looks right to me.
Acked-by: Nicholas Piggin <npiggin at gmail.com>
>
> Signed-off-by: Balbir Singh <bsingharora at gmail.com>
> ---
> arch/powerpc/include/asm/string.h | 2 +
> arch/powerpc/lib/Makefile | 2 +-
> arch/powerpc/lib/memcpy_mcsafe_64.S | 212 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 215 insertions(+), 1 deletion(-)
> create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S
>
> diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
> index 9b8cedf618f4..b7e872a64726 100644
> --- a/arch/powerpc/include/asm/string.h
> +++ b/arch/powerpc/include/asm/string.h
> @@ -30,7 +30,9 @@ extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
> #ifdef CONFIG_PPC64
> #define __HAVE_ARCH_MEMSET32
> #define __HAVE_ARCH_MEMSET64
> +#define __HAVE_ARCH_MEMCPY_MCSAFE
>
> +extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
> extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
> extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
> extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
> index 3c29c9009bbf..048afee9f518 100644
> --- a/arch/powerpc/lib/Makefile
> +++ b/arch/powerpc/lib/Makefile
> @@ -24,7 +24,7 @@ endif
>
> obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
> copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \
> - memcpy_64.o memcmp_64.o pmem.o
> + memcpy_64.o memcmp_64.o pmem.o memcpy_mcsafe_64.o
>
> obj64-$(CONFIG_SMP) += locks.o
> obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
> diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S b/arch/powerpc/lib/memcpy_mcsafe_64.S
> new file mode 100644
> index 000000000000..e7eaa9b6cded
> --- /dev/null
> +++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
> @@ -0,0 +1,212 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) IBM Corporation, 2011
> + * Derived from copyuser_power7.S by Anton Blanchard <anton at au.ibm.com>
> + * Author - Balbir Singh <bsingharora at gmail.com>
> + */
> +#include <asm/ppc_asm.h>
> +#include <asm/errno.h>
> +
> + .macro err1
> +100:
> + EX_TABLE(100b,.Ldo_err1)
> + .endm
> +
> + .macro err2
> +200:
> + EX_TABLE(200b,.Ldo_err2)
> + .endm
> +
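For readers unfamiliar with the err1/err2 macros above: each marked
access gets a local label plus an exception table entry, so a machine
check on that instruction resumes at the fixup label instead of being
fatal. Roughly (a sketch, modulo the exact section directives
EX_TABLE emits on powerpc):

	100:	ld	r0,0(r4)	/* the marked access */
		.section __ex_table,"a"
		.long	100b - .	/* faulting address (relative) */
		.long	.Ldo_err1 - .	/* fixup address (relative) */
		.previous

err1 is used while no non-volatile registers are live, err2 inside
the main loop after they have been saved, which is why the two fixup
paths below differ.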
> +/*
> + * err2 fixups land here: restore the non-volatile registers saved
> + * by the large-copy loop, then fall through to the common error
> + * return below.
> + */
> +.Ldo_err2:
> + ld r22,STK_REG(R22)(r1)
> + ld r21,STK_REG(R21)(r1)
> + ld r20,STK_REG(R20)(r1)
> + ld r19,STK_REG(R19)(r1)
> + ld r18,STK_REG(R18)(r1)
> + ld r17,STK_REG(R17)(r1)
> + ld r16,STK_REG(R16)(r1)
> + ld r15,STK_REG(R15)(r1)
> + ld r14,STK_REG(R14)(r1)
> + addi r1,r1,STACKFRAMESIZE
> +.Ldo_err1: /* err1 fixups land here directly */
> + li r3,-EFAULT
> + blr
> +
> +
> +_GLOBAL(memcpy_mcsafe)
> + cmpldi r5,16
> + blt .Lshort_copy
> +
> +.Lcopy:
> + /* Get the source 8B aligned */
> + neg r6,r4 /* r6 = bytes to the next 8B boundary of src */
> + mtocrf 0x01,r6 /* low bits of r6 -> cr7 for the bf tests below */
> + clrldi r6,r6,(64-3) /* r6 &= 7: alignment bytes to copy */
> +
> + bf cr7*4+3,1f
> +err1; lbz r0,0(r4)
> + addi r4,r4,1
> +err1; stb r0,0(r3)
> + addi r3,r3,1
> +
> +1: bf cr7*4+2,2f
> +err1; lhz r0,0(r4)
> + addi r4,r4,2
> +err1; sth r0,0(r3)
> + addi r3,r3,2
> +
> +2: bf cr7*4+1,3f
> +err1; lwz r0,0(r4)
> + addi r4,r4,4
> +err1; stw r0,0(r3)
> + addi r3,r3,4
> +
> +3: sub r5,r5,r6
> + cmpldi r5,128
> + blt 5f
> +
> + mflr r0
> + stdu r1,-STACKFRAMESIZE(r1)
> + std r14,STK_REG(R14)(r1)
> + std r15,STK_REG(R15)(r1)
> + std r16,STK_REG(R16)(r1)
> + std r17,STK_REG(R17)(r1)
> + std r18,STK_REG(R18)(r1)
> + std r19,STK_REG(R19)(r1)
> + std r20,STK_REG(R20)(r1)
> + std r21,STK_REG(R21)(r1)
> + std r22,STK_REG(R22)(r1)
> + std r0,STACKFRAMESIZE+16(r1)
> +
> + srdi r6,r5,7 /* loop count: number of 128B chunks */
> + mtctr r6
> +
> + /* Now do cacheline (128B) sized loads and stores. */
> + .align 5
> +4:
> +err2; ld r0,0(r4)
> +err2; ld r6,8(r4)
> +err2; ld r7,16(r4)
> +err2; ld r8,24(r4)
> +err2; ld r9,32(r4)
> +err2; ld r10,40(r4)
> +err2; ld r11,48(r4)
> +err2; ld r12,56(r4)
> +err2; ld r14,64(r4)
> +err2; ld r15,72(r4)
> +err2; ld r16,80(r4)
> +err2; ld r17,88(r4)
> +err2; ld r18,96(r4)
> +err2; ld r19,104(r4)
> +err2; ld r20,112(r4)
> +err2; ld r21,120(r4)
> + addi r4,r4,128
> +err2; std r0,0(r3)
> +err2; std r6,8(r3)
> +err2; std r7,16(r3)
> +err2; std r8,24(r3)
> +err2; std r9,32(r3)
> +err2; std r10,40(r3)
> +err2; std r11,48(r3)
> +err2; std r12,56(r3)
> +err2; std r14,64(r3)
> +err2; std r15,72(r3)
> +err2; std r16,80(r3)
> +err2; std r17,88(r3)
> +err2; std r18,96(r3)
> +err2; std r19,104(r3)
> +err2; std r20,112(r3)
> +err2; std r21,120(r3)
> + addi r3,r3,128
> + bdnz 4b
> +
> + clrldi r5,r5,(64-7)
> +
> + ld r14,STK_REG(R14)(r1)
> + ld r15,STK_REG(R15)(r1)
> + ld r16,STK_REG(R16)(r1)
> + ld r17,STK_REG(R17)(r1)
> + ld r18,STK_REG(R18)(r1)
> + ld r19,STK_REG(R19)(r1)
> + ld r20,STK_REG(R20)(r1)
> + ld r21,STK_REG(R21)(r1)
> + ld r22,STK_REG(R22)(r1)
> + addi r1,r1,STACKFRAMESIZE
> +
> + /* Up to 127B to go; bits of len>>4 select the 64B/32B/16B tails */
> +5: srdi r6,r5,4
> + mtocrf 0x01,r6
> +
> +6: bf cr7*4+1,7f
> +err1; ld r0,0(r4)
> +err1; ld r6,8(r4)
> +err1; ld r7,16(r4)
> +err1; ld r8,24(r4)
> +err1; ld r9,32(r4)
> +err1; ld r10,40(r4)
> +err1; ld r11,48(r4)
> +err1; ld r12,56(r4)
> + addi r4,r4,64
> +err1; std r0,0(r3)
> +err1; std r6,8(r3)
> +err1; std r7,16(r3)
> +err1; std r8,24(r3)
> +err1; std r9,32(r3)
> +err1; std r10,40(r3)
> +err1; std r11,48(r3)
> +err1; std r12,56(r3)
> + addi r3,r3,64
> +
> + /* Up to 63B to go */
> +7: bf cr7*4+2,8f
> +err1; ld r0,0(r4)
> +err1; ld r6,8(r4)
> +err1; ld r7,16(r4)
> +err1; ld r8,24(r4)
> + addi r4,r4,32
> +err1; std r0,0(r3)
> +err1; std r6,8(r3)
> +err1; std r7,16(r3)
> +err1; std r8,24(r3)
> + addi r3,r3,32
> +
> + /* Up to 31B to go */
> +8: bf cr7*4+3,9f
> +err1; ld r0,0(r4)
> +err1; ld r6,8(r4)
> + addi r4,r4,16
> +err1; std r0,0(r3)
> +err1; std r6,8(r3)
> + addi r3,r3,16
> +
> +9: clrldi r5,r5,(64-4)
> +
> + /* Up to 15B to go */
> +.Lshort_copy:
> + mtocrf 0x01,r5
> + bf cr7*4+0,12f
> +err1; lwz r0,0(r4) /* Less chance of a reject with word ops */
> +err1; lwz r6,4(r4)
> + addi r4,r4,8
> +err1; stw r0,0(r3)
> +err1; stw r6,4(r3)
> + addi r3,r3,8
> +
> +12: bf cr7*4+1,13f
> +err1; lwz r0,0(r4)
> + addi r4,r4,4
> +err1; stw r0,0(r3)
> + addi r3,r3,4
> +
> +13: bf cr7*4+2,14f
> +err1; lhz r0,0(r4)
> + addi r4,r4,2
> +err1; sth r0,0(r3)
> + addi r3,r3,2
> +
> +14: bf cr7*4+3,15f
> +err1; lbz r0,0(r4)
> +err1; stb r0,0(r3)
> +
> +15: li r3,0
> + blr
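For anyone who wants to poke at this, a quick hypothetical smoke test
from a kernel module might look like the following. It only exercises
the success path (no machine check is injected), and the function name
is made up:

	#include <linux/init.h>
	#include <linux/kernel.h>
	#include <linux/string.h>

	/* Hypothetical smoke test; not part of the patch. */
	static int __init mcsafe_smoke_test(void)
	{
		static char src[256], dst[256];
		int rc;

		memset(src, 0xa5, sizeof(src));
		rc = memcpy_mcsafe(dst, src, sizeof(src));
		pr_info("memcpy_mcsafe: rc=%d match=%d\n",
			rc, !memcmp(src, dst, sizeof(src)));
		return 0;
	}
	late_initcall(mcsafe_smoke_test);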