[PATCH v1 2/3] powerpc: enhance memcmp() with VMX instructions for long-buffer comparison

wei.guo.simon at gmail.com
Tue Sep 19 20:03:58 AEST 2017


From: Simon Guo <wei.guo.simon at gmail.com>

This patch adds VMX primitives to do memcmp() when the compare size
exceeds 4K bytes.
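
At a high level, the existing scalar path is kept for short buffers,
and the VMX loop is only taken when the length is over 4K and VMX is
actually usable in the current context.  A minimal C sketch of the
dispatch follows (memcmp_vmx/memcmp_scalar are hypothetical names for
the two assembly paths; enter_vmx_ops()/exit_vmx_copy() are the real
helpers, and the sketch assumes the usual enter/exit pairing that
copypage_power7.S uses):
------
/* Hypothetical C view of the dispatch implemented in
 * arch/powerpc/lib/memcmp_64.S.  enter_vmx_ops() returns 0 when the
 * vector unit cannot be used (e.g. in interrupt context), in which
 * case the existing 8-byte-word scalar loop runs instead.
 */
static int memcmp_dispatch(const void *s1, const void *s2, size_t n)
{
	int ret;

	if (IS_ENABLED(CONFIG_ALTIVEC) && n > 4096 && enter_vmx_ops()) {
		ret = memcmp_vmx(s1, s2, n);	/* 32 bytes per iteration */
		exit_vmx_copy(NULL);		/* re-enable preemption */
		return ret;
	}

	return memcmp_scalar(s1, s2, n);	/* existing scalar loop */
}
------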

Test results with the following test program:
------
tools/testing/selftests/powerpc/stringloops# cat memcmp.c

#include <malloc.h>
#include <stdlib.h>
#include "utils.h"

/* SIZE and ITERATIONS are assumed to be defined elsewhere in the
 * harness (their values are not shown in this post).
 */
int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
	char *s1;
	char *s2;
	unsigned long i;

	s1 = memalign(128, SIZE);
	if (!s1) {
		perror("memalign");
		exit(1);
	}

	s2 = memalign(128, SIZE);
	if (!s2) {
		perror("memalign");
		exit(1);
	}

	for (i = 0; i < SIZE; i++)  {
		s1[i] = i & 0xff;
		s2[i] = i & 0xff;
	}
	for (i = 0; i < ITERATIONS; i++)
		test_memcmp(s1, s2, SIZE);

	return 0;
}

int main(void)
{
	return test_harness(testcase, "memcmp");
}

------
Without VMX patch:
       5.085776331 seconds time elapsed                                          ( +-  0.28% )
With VMX patch:
       4.584002052 seconds time elapsed                                          ( +-  0.02% )

There is a ~10% improvement ((5.086 - 4.584) / 5.086 ≈ 9.9%).

However, I am not yet aware of a use case in the kernel for memcmp()
on such large sizes.

Signed-off-by: Simon Guo <wei.guo.simon at gmail.com>
---
 arch/powerpc/include/asm/asm-prototypes.h |  2 +-
 arch/powerpc/lib/copypage_power7.S        |  2 +-
 arch/powerpc/lib/memcmp_64.S              | 79 +++++++++++++++++++++++++++++++
 arch/powerpc/lib/memcpy_power7.S          |  2 +-
 arch/powerpc/lib/vmx-helper.c             |  2 +-
 5 files changed, 83 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 7330150..e6530d8 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,7 +49,7 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
+int enter_vmx_ops(void);
 void * exit_vmx_copy(void *dest);
 
 /* Traps */
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index ca5fc8f..9e7729e 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -60,7 +60,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 6dbafdb..b86a1d3 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -153,6 +153,13 @@ _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+	/* Try to use vmx loop if length is larger than 4K */
+	cmpldi  cr6,r5,4096
+	bgt	cr6,.Lvmx_cmp
+
+.Llong_novmx_cmp:
+#endif
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -310,4 +317,76 @@ _GLOBAL(memcmp)
 8:
 	blr
 
+#ifdef CONFIG_ALTIVEC
+.Lvmx_cmp:
+	mflr    r0
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
+	std     r0,16(r1)
+	stdu    r1,-STACKFRAMESIZE(r1)
+	bl      enter_vmx_ops
+	cmpwi   cr1,r3,0
+	ld      r0,STACKFRAMESIZE+16(r1)
+	ld      r3,STK_REG(R31)(r1)
+	ld      r4,STK_REG(R30)(r1)
+	ld      r5,STK_REG(R29)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	mtlr    r0
+	beq     cr1,.Llong_novmx_cmp
+
+3:
+	/* Here the src/dst addresses are 8-byte aligned and len is
+	 * at least 4KB.  Align them further to a 16-byte boundary.
+	 */
+	andi.	rA,r3,8
+	beq	4f
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+
+	addi	r3,r3,8
+	addi	r4,r4,8
+
+4:
+	/* compare 32 bytes per loop iteration */
+	srdi	r0,r5,5
+	mtctr	r0
+	andi.	r5,r5,31
+	li	off16,16
+5:
+	lvx 	v0,0,r3
+	lvx 	v1,0,r4
+	vcmpequd. v0,v0,v1
+	bf	24,7f
+	lvx 	v0,off16,r3
+	lvx 	v1,off16,r4
+	vcmpequd. v0,v0,v1
+	bf	24,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lshort
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+
+	li	off8,8
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
 EXPORT_SYMBOL(memcmp)
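
For readers who do not speak ppc64 assembly, here is a hedged userspace
C equivalent of the 32-bytes-per-iteration loop above, using AltiVec
intrinsics and built with -maltivec (memcmp_vmx_sketch is an
illustrative name, not code from the patch).  The kernel code uses
lvx/vcmpequd. directly and recomputes the ordered result with 8-byte
loads; note that vcmpequd is an ISA 2.07 (POWER8) instruction, while
vec_all_eq here compiles to the byte-wise vcmpequb, which is equivalent
for detecting a mismatch:
------
#include <altivec.h>
#include <stddef.h>
#include <string.h>

/* Compare two 16-byte vectors per step; once any difference is seen,
 * fall back to scalar memcmp() for the ordered <0/0/>0 result (the
 * assembly instead branches to .LcmpAB_lightweight).  Assumes s1/s2
 * are 16-byte aligned, as the assembly guarantees before the loop.
 */
static int memcmp_vmx_sketch(const unsigned char *s1,
			     const unsigned char *s2, size_t n)
{
	while (n >= 32) {
		vector unsigned char a0 = vec_ld(0, s1);
		vector unsigned char b0 = vec_ld(0, s2);
		vector unsigned char a1 = vec_ld(16, s1);
		vector unsigned char b1 = vec_ld(16, s2);

		if (!vec_all_eq(a0, b0) || !vec_all_eq(a1, b1))
			return memcmp(s1, s2, 32);

		s1 += 32;
		s2 += 32;
		n -= 32;
	}

	return n ? memcmp(s1, s2, n) : 0;	/* tail, like .Lshort */
}
------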
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index 193909a..682e386 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..923a9ab 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
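
For context, a sketch of the renamed helper's full body (the hunk above
only shows the renamed first line; the rest is the existing
enter_vmx_copy() logic from vmx-helper.c, unchanged):
------
int enter_vmx_ops(void)
{
	if (in_interrupt())
		return 0;	/* caller falls back to the scalar path */

	preempt_disable();

	enable_kernel_altivec();

	return 1;
}
------
VMX cannot be used from interrupt context, and preemption must stay
disabled while kernel code owns the vector registers, which is why each
successful enter needs a matching exit that calls preempt_enable().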
-- 
1.8.3.1


