[PATCH] powerpc: Add POWER9 copy_page() loop
Anton Blanchard
anton at ozlabs.org
Tue Mar 21 10:40:46 AEDT 2017
From: Anton Blanchard <anton@samba.org>
Add a POWER9 optimised copy_page() loop. This loop uses the new D form
vector loads and stores, and uses dcbz to pre zero the destination.
A few questions:
- I'm using a nested feature section, but that is going to get unwieldy
at some stage. It would be nice to update the call site for copy_page
directly.
- I'm using CPU_FTR_ARCH_300, but as our functions grow perhaps we want
the cputable entry to contain a pointer to optimised functions.
Signed-off-by: Anton Blanchard <anton@samba.org>
---
arch/powerpc/lib/Makefile | 2 +-
arch/powerpc/lib/copypage_64.S | 4 +
arch/powerpc/lib/copypage_power9.S | 224 +++++++++++++++++++++++++++++++++++++
3 files changed, 229 insertions(+), 1 deletion(-)
create mode 100644 arch/powerpc/lib/copypage_power9.S
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 2b5e090..d3667b5 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o
obj64-y += copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \
copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \
- memcpy_64.o memcmp_64.o
+ memcpy_64.o memcmp_64.o copypage_power9.o
obj64-$(CONFIG_SMP) += locks.o
obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 4bcc9e7..051423e 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -21,7 +21,11 @@ _GLOBAL_TOC(copy_page)
BEGIN_FTR_SECTION
 lis r5,PAGE_SIZE@h
FTR_SECTION_ELSE
+ BEGIN_FTR_SECTION_NESTED(50)
+ b copypage_power9
+ FTR_SECTION_ELSE_NESTED(50)
b copypage_power7
+ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50)
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 ori r5,r5,PAGE_SIZE@l
BEGIN_FTR_SECTION
diff --git a/arch/powerpc/lib/copypage_power9.S b/arch/powerpc/lib/copypage_power9.S
new file mode 100644
index 0000000..2493f94
--- /dev/null
+++ b/arch/powerpc/lib/copypage_power9.S
@@ -0,0 +1,224 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2017
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+
+_GLOBAL(copypage_power9)
+ /*
+ * POWER9 optimised copy_page().
+ *
+ * In:  r3 = destination page (page aligned)
+ *      r4 = source page (page aligned)
+ * Copies PAGE_SIZE bytes, one 128 byte cacheline per loop
+ * iteration, using VSX lxv/stxv when VMX is usable and a
+ * GPR ld/std fallback otherwise.
+ */
+ /*
+ * We prefetch the source using enhanced touch instructions. We use
+ * a stream ID of 0 for this. Since the source is page aligned we
+ * don't need to clear the bottom 7 bits of the address.
+ */
+#ifdef CONFIG_PPC_64K_PAGES
+ lis r7,0x0E01 /* depth=7, units/cachelines=512 */
+#else
+ lis r7,0x0E00 /* depth=7 */
+ ori r7,r7,0x1000 /* units/cachelines=32 */
+#endif
+
+ lis r8,0x8000 /* GO=1 */
+ clrldi r8,r8,32
+
+.machine push
+.machine "power4"
+ /* setup read stream 0 */
+ dcbt r0,r4,0b01000 /* addr from */
+ dcbt r0,r7,0b01010 /* length and depth from */
+ eieio
+ dcbt r0,r8,0b01010 /* all streams GO */
+ eieio
+.machine pop
+
+ /*
+ * To reduce memory bandwidth on the store side we send dcbzs ahead.
+ * Experimental testing shows 2 cachelines as good enough.
+ */
+ li r6,128 /* cacheline size */
+ dcbz 0,r3 /* pre-zero the first two destination lines */
+ dcbz r6,r3
+
+#ifdef CONFIG_ALTIVEC
+ /* Save LR and the r3/r4 arguments across the enter_vmx_copy() call */
+ mflr r0
+ std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+ std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+ std r0,16(r1)
+ stdu r1,-STACKFRAMESIZE(r1)
+ bl enter_vmx_copy
+ cmpwi r3,0 /* r3 == 0: VMX unusable, take GPR fallback below */
+ ld r0,STACKFRAMESIZE+16(r1)
+ ld r3,STK_REG(R31)(r1)
+ ld r4,STK_REG(R30)(r1)
+ addi r1,r1,STACKFRAMESIZE
+ mtlr r0
+
+ /*
+ * Main loop runs for all but the last 2 cachelines; those were
+ * already zeroed above, so they are copied without a dcbz.
+ */
+ li r0,((PAGE_SIZE/128)-2)
+ mtctr r0
+
+ li r8,256 /* dcbz offset: 2 cachelines ahead of the stores */
+
+ beq .Lnonvmx_copy /* tests the cmpwi result from enter_vmx_copy */
+
+ .balign 16
+ /* VSX copy: 128 bytes per iteration, zeroing 2 lines ahead */
+1: dcbz r8,r3
+ lxv vs32,0(r4)
+ lxv vs33,16(r4)
+ stxv vs32,0(r3)
+ stxv vs33,16(r3)
+
+ lxv vs34,32(r4)
+ lxv vs35,48(r4)
+ stxv vs34,32(r3)
+ stxv vs35,48(r3)
+
+ lxv vs36,64(r4)
+ lxv vs37,80(r4)
+ stxv vs36,64(r3)
+ stxv vs37,80(r3)
+
+ lxv vs38,96(r4)
+ lxv vs39,112(r4)
+ stxv vs38,96(r3)
+ stxv vs39,112(r3)
+
+ addi r4,r4,128
+ addi r3,r3,128
+ bdnz 1b
+
+ /* Final 2 cachelines: already pre-zeroed, no dcbz needed */
+ li r0,2
+ mtctr r0
+
+1: lxv vs32,0(r4)
+ lxv vs33,16(r4)
+ stxv vs32,0(r3)
+ stxv vs33,16(r3)
+
+ lxv vs34,32(r4)
+ lxv vs35,48(r4)
+ stxv vs34,32(r3)
+ stxv vs35,48(r3)
+
+ lxv vs36,64(r4)
+ lxv vs37,80(r4)
+ stxv vs36,64(r3)
+ stxv vs37,80(r3)
+
+ lxv vs38,96(r4)
+ lxv vs39,112(r4)
+ stxv vs38,96(r3)
+ stxv vs39,112(r3)
+
+ addi r4,r4,128
+ addi r3,r3,128
+ bdnz 1b
+
+ b exit_vmx_copy /* tail call optimise */
+#else
+ /* No ALTIVEC: set up loop count and dcbz offset for the GPR copy */
+ li r0,((PAGE_SIZE/128)-2)
+ mtctr r0
+
+ li r8,256 /* dcbz offset: 2 cachelines ahead of the stores */
+#endif
+
+ .balign 16
+.Lnonvmx_copy:
+ /* GPR copy: 16 x 8 byte ld/std = 128 bytes per iteration */
+1: dcbz r8,r3
+ ld r0,0(r4)
+ ld r5,8(r4)
+ ld r6,16(r4)
+ ld r7,24(r4)
+ std r0,0(r3)
+ std r5,8(r3)
+ std r6,16(r3)
+ std r7,24(r3)
+
+ ld r0,32(r4)
+ ld r5,40(r4)
+ ld r6,48(r4)
+ ld r7,56(r4)
+ std r0,32(r3)
+ std r5,40(r3)
+ std r6,48(r3)
+ std r7,56(r3)
+
+ ld r0,64(r4)
+ ld r5,72(r4)
+ ld r6,80(r4)
+ ld r7,88(r4)
+ std r0,64(r3)
+ std r5,72(r3)
+ std r6,80(r3)
+ std r7,88(r3)
+
+ ld r0,96(r4)
+ ld r5,104(r4)
+ ld r6,112(r4)
+ ld r7,120(r4)
+ addi r4,r4,128
+ std r0,96(r3)
+ std r5,104(r3)
+ std r6,112(r3)
+ std r7,120(r3)
+ addi r3,r3,128
+ bdnz 1b
+
+ /* Final 2 cachelines: already pre-zeroed, no dcbz needed */
+ li r0,2
+ mtctr r0
+
+1: ld r0,0(r4)
+ ld r5,8(r4)
+ ld r6,16(r4)
+ ld r7,24(r4)
+ std r0,0(r3)
+ std r5,8(r3)
+ std r6,16(r3)
+ std r7,24(r3)
+
+ ld r0,32(r4)
+ ld r5,40(r4)
+ ld r6,48(r4)
+ ld r7,56(r4)
+ std r0,32(r3)
+ std r5,40(r3)
+ std r6,48(r3)
+ std r7,56(r3)
+
+ ld r0,64(r4)
+ ld r5,72(r4)
+ ld r6,80(r4)
+ ld r7,88(r4)
+ std r0,64(r3)
+ std r5,72(r3)
+ std r6,80(r3)
+ std r7,88(r3)
+
+ ld r0,96(r4)
+ ld r5,104(r4)
+ ld r6,112(r4)
+ ld r7,120(r4)
+ addi r4,r4,128
+ std r0,96(r3)
+ std r5,104(r3)
+ std r6,112(r3)
+ std r7,120(r3)
+ addi r3,r3,128
+ bdnz 1b
+
+ blr
--
2.9.3
More information about the Linuxppc-dev
mailing list