powerpc32: rearrange instruction order in ip_fast_csum()

Fri Sep 19 23:57:56 EST 2014

On PPC_8xx, lwz has a 2-cycle latency, and branching also takes 2 cycles.
As the header is at least 5 words long, we can unroll the loop for the first
words to reduce the number of branches, and we can re-order the instructions
to limit the load latency.

Signed-off-by: Christophe Leroy <christophe.leroy at c-s.fr>

---
 arch/powerpc/lib/checksum_32.S | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 6d67e05..5500704 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -26,13 +26,17 @@
 _GLOBAL(ip_fast_csum)
 	lwz	r0,0(r3)
 	lwzu	r5,4(r3)
-	addic.	r4,r4,-2
+	addic.	r4,r4,-4
 	addc	r0,r0,r5
 	mtctr	r4
 	blelr-
-1:	lwzu	r4,4(r3)
-	adde	r0,r0,r4
+	lwzu	r5,4(r3)
+	lwzu	r4,4(r3)
+	adde	r0,r0,r5
+1:	adde	r0,r0,r4
+	lwzu	r4,4(r3)
 	bdnz	1b
+	adde	r0,r0,r4
 	addze	r0,r0		/* add in final carry */
 	rlwinm	r3,r0,16,0,31	/* fold two halves together */
 	add	r3,r0,r3
-- 
2.1.0