[PATCH v5 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparison
Michael Ellerman
mpe at ellerman.id.au
Thu May 24 17:44:33 AEST 2018
Hi Simon,
wei.guo.simon at gmail.com writes:
> From: Simon Guo <wei.guo.simon at gmail.com>
>
> This patch adds VMX primitives to do memcmp() in case the compare size
> exceeds 4K bytes. The KSM feature can benefit from this.
You say "exceeds 4K" here.
> diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> index f20e883..6303bbf 100644
> --- a/arch/powerpc/lib/memcmp_64.S
> +++ b/arch/powerpc/lib/memcmp_64.S
> @@ -27,12 +27,73 @@
> #define LH lhbrx
> #define LW lwbrx
> #define LD ldbrx
> +#define LVS lvsr
> +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> + vperm _VRT,_VRB,_VRA,_VRC
> #else
> #define LH lhzx
> #define LW lwzx
> #define LD ldx
> +#define LVS lvsl
> +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> + vperm _VRT,_VRA,_VRB,_VRC
> #endif
>
> +#define VMX_OPS_THRES 4096
THRES == 4096
BTW, can we call it VMX_THRESH?
> +#define ENTER_VMX_OPS \
> + mflr r0; \
> + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> + std r0,16(r1); \
> + stdu r1,-STACKFRAMESIZE(r1); \
> + bl enter_vmx_ops; \
> + cmpwi cr1,r3,0; \
> + ld r0,STACKFRAMESIZE+16(r1); \
> + ld r3,STK_REG(R31)(r1); \
> + ld r4,STK_REG(R30)(r1); \
> + ld r5,STK_REG(R29)(r1); \
> + addi r1,r1,STACKFRAMESIZE; \
> + mtlr r0
> +
> +#define EXIT_VMX_OPS \
> + mflr r0; \
> + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> + std r0,16(r1); \
> + stdu r1,-STACKFRAMESIZE(r1); \
> + bl exit_vmx_ops; \
> + ld r0,STACKFRAMESIZE+16(r1); \
> + ld r3,STK_REG(R31)(r1); \
> + ld r4,STK_REG(R30)(r1); \
> + ld r5,STK_REG(R29)(r1); \
> + addi r1,r1,STACKFRAMESIZE; \
> + mtlr r0
> +
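A side note for readers: each of these macros builds a stack frame,
saves the three memcmp() arguments (r3-r5), calls the
enter_vmx_ops()/exit_vmx_ops() helpers, and then restores everything.
Judging by the cmpwi against zero, enter_vmx_ops() returns nonzero when
VMX is usable, and cr1 carries that result out of the macro so the
caller can fall back to the scalar path. A rough C sketch of the
intended control flow (illustrative only; scalar_cmp() and
vmx_cmp_loop() are made-up names, not part of the patch):

  /* Sketch only; the real logic is the asm above. */
  static int memcmp_long(const void *s1, const void *s2, size_t n)
  {
          if (!enter_vmx_ops())                 /* VMX unusable here */
                  return scalar_cmp(s1, s2, n); /* hypothetical fallback */

          int rc = vmx_cmp_loop(s1, s2, n);     /* hypothetical VMX loop */

          exit_vmx_ops();                       /* hand VMX state back */
          return rc;
  }
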
> +/*
> + * LD_VSR_CROSS16B loads the 2nd 16 bytes of _vaddr, which is not
> + * aligned to a 16-byte boundary, and permutes the result with the
> + * 1st 16 bytes.
> + *
> + * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
> + * ^ ^ ^
> + * 0xbbbb10 0xbbbb20 0xbbbb30
> + * ^
> + * _vaddr
> + *
> + *
> + * _vmask is the mask generated by LVS
> + * _v1st_qw is the 1st aligned QW of the current address, which is
> + * already loaded.
> + * for example: 0xyyyyyyyyyyyyy012 for big endian
> + * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
> + * for example: 0x3456789abcdefzzz for big endian
> + * The permute result is saved in _v_res.
> + * for example: 0x0123456789abcdef for big endian.
> + */
> +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
> + lvx _v2nd_qw,_vaddr,off16; \
> + VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
> +
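The diagram above is helpful. For readers unfamiliar with the lvsl/lvsr
idiom: an unaligned 16-byte load is synthesised from two aligned lvx
loads plus a vperm whose control vector (from LVS) selects the 16 bytes
starting at the misaligned offset. A scalar C sketch of the same idea
(illustrative only; endian handling hand-waved):

  #include <string.h>

  /* Emulate LD_VSR_CROSS16B: 16 unaligned bytes via 2 aligned loads. */
  static void load_cross16b(const unsigned char *vaddr,
                            unsigned char out[16])
  {
          const unsigned char *aligned =
                  (const unsigned char *)((unsigned long)vaddr & ~15UL);
          unsigned long off = (unsigned long)vaddr & 15;
          unsigned char tmp[32];

          memcpy(tmp, aligned, 16);             /* _v1st_qw (lvx) */
          memcpy(tmp + 16, aligned + 16, 16);   /* _v2nd_qw (lvx) */

          /* the vperm: take the 16 bytes starting at the misalignment */
          memcpy(out, tmp + off, 16);
  }
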
> /*
> * There are 2 categories for memcmp:
> * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
> @@ -174,6 +235,13 @@ _GLOBAL(memcmp)
> blr
>
> .Llong:
> +#ifdef CONFIG_ALTIVEC
> + /* Try to use vmx loop if length is larger than 4K */
> + cmpldi cr6,r5,VMX_OPS_THRES
> + bge cr6,.Lsameoffset_vmx_cmp
Here we compare the length to 4K, and if it's greater *or equal* then we
go to the VMX case. Or am I reading it backwards?

So we should say "if the size is 4K or more we do VMX", shouldn't we?
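
In C terms the cmpldi/bge pair reads as (sketch, using the VMX_THRESH
name suggested above):

  if (n >= VMX_THRESH)  /* bge: branch if greater than or equal */
          goto sameoffset_vmx_cmp;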
cheers