[PATCH 3/4] powerpc/32: optimise memset()

Christophe Leroy christophe.leroy at c-s.fr
Thu Aug 24 00:54:36 AEST 2017


There is no need to replicate the set value across a full word when the
length is less than 4, as in that case we only do byte stores.
We can therefore branch immediately to the code handling that case.
By separating it from the normal path, we also save a few instructions
that adjust the destination pointer.
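For illustration, the control flow after this patch looks roughly like
the C sketch below. This is only a model of the idea, not the kernel
code: the function name is made up, and the word-store path is
simplified (the real assembly arranges the head store differently).

	#include <stddef.h>

	/* Illustrative model: lengths below 4 go straight to a byte
	 * loop without replicating the fill value; longer lengths
	 * replicate the byte into a word and use aligned word stores
	 * with byte fixups at both ends.
	 */
	static void *memset_sketch(void *dst, int c, size_t n)
	{
		unsigned char *p = dst;
		unsigned int word;

		if (n < 4) {
			/* Short case: byte stores only. */
			while (n--)
				*p++ = (unsigned char)c;
			return dst;
		}

		/* Replicate the low byte into all four bytes of a word,
		 * which is what the two rlwimi instructions do.
		 */
		word = (c & 0xff) * 0x01010101u;

		/* Align the destination with byte stores. */
		while ((unsigned long)p & 3) {
			*p++ = (unsigned char)c;
			n--;
		}

		/* Bulk of the work: aligned word stores. */
		for (; n >= 4; n -= 4, p += 4)
			*(unsigned int *)p = word;

		/* Trailing bytes. */
		while (n--)
			*p++ = (unsigned char)c;

		return dst;
	}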

Signed-off-by: Christophe Leroy <christophe.leroy at c-s.fr>
---
 arch/powerpc/lib/copy_32.S | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index a3ffeac69eca..05aaee20590f 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -91,17 +91,17 @@ EXPORT_SYMBOL(memset16)
  * replaced by a nop once cache is active. This is done in machine_init()
  */
 _GLOBAL(memset)
+	cmplwi	0,r5,4
+	blt	7f
+
 	rlwimi	r4,r4,8,16,23
 	rlwimi	r4,r4,16,0,15
 
-	addi	r6,r3,-4
-	cmplwi	0,r5,4
-	blt	7f
-	stwu	r4,4(r6)
+	stw	r4,0(r3)
 	beqlr
-	andi.	r0,r6,3
+	andi.	r0,r3,3
 	add	r5,r0,r5
-	subf	r6,r0,r6
+	subf	r6,r0,r3
 	cmplwi	0,r4,0
 	bne	2f	/* Use normal procedure if r4 is not zero */
 _GLOBAL(memset_nocache_branch)
@@ -132,13 +132,20 @@ _GLOBAL(memset_nocache_branch)
 1:	stwu	r4,4(r6)
 	bdnz	1b
 6:	andi.	r5,r5,3
-7:	cmpwi	0,r5,0
 	beqlr
 	mtctr	r5
 	addi	r6,r6,3
 8:	stbu	r4,1(r6)
 	bdnz	8b
 	blr
+
+7:	cmpwi	0,r5,0
+	beqlr
+	mtctr	r5
+	addi	r6,r3,-1
+9:	stbu	r4,1(r6)
+	bdnz	9b
+	blr
 EXPORT_SYMBOL(memset)
 
 /*
-- 
2.13.3


