[PATCH 2/2] powerpc: Batch up loads/stores on saving and restoring VSX

Cyril Bur cyrilbur at gmail.com
Tue Mar 1 16:55:35 AEDT 2016


Currently the assembly to save and restore VSX registers boils down to a
load immediate of the offset of the specific VSX register in memory
followed by the load/store which repeats in sequence for each VSX register.

This patch attempts to do better by loading up four registers with
immediates so that the loads and stores can be batched up and better
pipelined by the processor.

This patch results in four loads/stores in sequence and one add between
groups of four. Also, using a pair of base registers means that the
result of the add is not needed by the immediately following instruction.

Due to the overlapping layout of FPU registers and VSX registers on POWER
chips, this patch also benefits FPU loads and stores when VSX is compiled
in and the CPU is VSX capable.

Signed-off-by: Cyril Bur <cyrilbur at gmail.com>
---
 arch/powerpc/include/asm/ppc_asm.h | 65 ++++++++++++++++++++++++++++++--------
 arch/powerpc/kernel/fpu.S          | 43 ++++++++++++++++---------
 arch/powerpc/kernel/tm.S           | 46 ++++++++++++++-------------
 3 files changed, 104 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 5ba69ed..dd0df12 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -173,19 +173,58 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #define LXVD2X_ROT(n,b,base)		LXVD2X(n,b,base);	\
 					XXSWAPD(n,n)
 #endif
-/* Save the lower 32 VSRs in the thread VSR region */
-#define SAVE_VSR(n,b,base)	li b,16*(n);  STXVD2X_ROT(n,R##base,R##b)
-#define SAVE_2VSRS(n,b,base)	SAVE_VSR(n,b,base); SAVE_VSR(n+1,b,base)
-#define SAVE_4VSRS(n,b,base)	SAVE_2VSRS(n,b,base); SAVE_2VSRS(n+2,b,base)
-#define SAVE_8VSRS(n,b,base)	SAVE_4VSRS(n,b,base); SAVE_4VSRS(n+4,b,base)
-#define SAVE_16VSRS(n,b,base)	SAVE_8VSRS(n,b,base); SAVE_8VSRS(n+8,b,base)
-#define SAVE_32VSRS(n,b,base)	SAVE_16VSRS(n,b,base); SAVE_16VSRS(n+16,b,base)
-#define REST_VSR(n,b,base)	li b,16*(n); LXVD2X_ROT(n,R##base,R##b)
-#define REST_2VSRS(n,b,base)	REST_VSR(n,b,base); REST_VSR(n+1,b,base)
-#define REST_4VSRS(n,b,base)	REST_2VSRS(n,b,base); REST_2VSRS(n+2,b,base)
-#define REST_8VSRS(n,b,base)	REST_4VSRS(n,b,base); REST_4VSRS(n+4,b,base)
-#define REST_16VSRS(n,b,base)	REST_8VSRS(n,b,base); REST_8VSRS(n+8,b,base)
-#define REST_32VSRS(n,b,base)	REST_16VSRS(n,b,base); REST_16VSRS(n+16,b,base)
+
+#define __SAVE_4VSRS(n,off0,off1,off2,off3,base) \
+	STXVD2X_ROT(n,R##base,R##off0); \
+	STXVD2X_ROT(n+1,R##base,R##off1); \
+	STXVD2X_ROT(n+2,R##base,R##off2); \
+	STXVD2X_ROT(n+3,R##base,R##off3)
+
+/* Restores the base for the caller */
+#define SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+	addi reg4,base,64; \
+	li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
+	__SAVE_4VSRS(0,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__SAVE_4VSRS(4,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__SAVE_4VSRS(8,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__SAVE_4VSRS(12,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__SAVE_4VSRS(16,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__SAVE_4VSRS(20,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__SAVE_4VSRS(24,reg0,reg1,reg2,reg3,base); \
+	__SAVE_4VSRS(28,reg0,reg1,reg2,reg3,reg4); \
+	subi base,base,384
+
+#define __REST_4VSRS(n,off0,off1,off2,off3,base) \
+	LXVD2X_ROT(n,R##base,R##off0); \
+	LXVD2X_ROT(n+1,R##base,R##off1); \
+	LXVD2X_ROT(n+2,R##base,R##off2); \
+	LXVD2X_ROT(n+3,R##base,R##off3)
+
+/* Restores the base for the caller */
+#define REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+	addi reg4,base,64; \
+	li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
+	__REST_4VSRS(0,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__REST_4VSRS(4,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__REST_4VSRS(8,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__REST_4VSRS(12,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__REST_4VSRS(16,reg0,reg1,reg2,reg3,base); \
+	addi base,base,128; \
+	__REST_4VSRS(20,reg0,reg1,reg2,reg3,reg4); \
+	addi reg4,reg4,128; \
+	__REST_4VSRS(24,reg0,reg1,reg2,reg3,base); \
+	__REST_4VSRS(28,reg0,reg1,reg2,reg3,reg4); \
+	subi base,base,384
 
 /*
  * b = base register for addressing, o = base offset from register of 1st EVR
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 15da2b5..dc57ff1 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -26,29 +26,32 @@
 #include <asm/ptrace.h>
 
 #ifdef CONFIG_VSX
-#define __REST_32FPVSRS(n,c,base)					\
+#define __REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)	\
 BEGIN_FTR_SECTION							\
 	b	2f;							\
 END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
-	REST_32FPRS(n,base);						\
+	REST_32FPRS(0,base);						\
 	b	3f;							\
-2:	REST_32VSRS(n,c,base);						\
+2:	REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
 3:
 
-#define __SAVE_32FPVSRS(n,c,base)					\
+#define __SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
 BEGIN_FTR_SECTION							\
 	b	2f;							\
 END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
-	SAVE_32FPRS(n,base);						\
+	SAVE_32FPRS(0,base);						\
 	b	3f;							\
-2:	SAVE_32VSRS(n,c,base);						\
+2:	SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
 3:
 #else
-#define __REST_32FPVSRS(n,b,base)	REST_32FPRS(n, base)
-#define __SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n, base)
+#define __REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)	REST_32FPRS(0, base)
+#define __SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base)	SAVE_32FPRS(0, base)
 #endif
-#define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base)
-#define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base)
+#define REST_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
+__REST_32FPVSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
+
+#define SAVE_32FPVSRS(reg0,reg1,reg2,reg3,reg4,base) \
+__SAVE_32FPVSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 /* void do_load_up_transact_fpu(struct thread_struct *thread)
@@ -56,6 +59,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
  * This is similar to load_up_fpu but for the transactional version of the FP
  * register set.  It doesn't mess with the task MSR or valid flags.
  * Furthermore, we don't do lazy FP with TM currently.
+ *
+ * Is called from C
  */
 _GLOBAL(do_load_up_transact_fpu)
 	mfmsr	r6
@@ -71,7 +76,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	addi	r7,r3,THREAD_TRANSACT_FPSTATE
 	lfd	fr0,FPSTATE_FPSCR(r7)
 	MTFSF_L(fr0)
-	REST_32FPVSRS(0, R4, R7)
+	REST_32FPVSRS(R4,R5,R6,R8,R9,R7)
 
 	blr
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
@@ -79,19 +84,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 /*
  * Load state from memory into FP registers including FPSCR.
  * Assumes the caller has enabled FP in the MSR.
+ *
+ * Is called from C
  */
 _GLOBAL(load_fp_state)
 	lfd	fr0,FPSTATE_FPSCR(r3)
 	MTFSF_L(fr0)
-	REST_32FPVSRS(0, R4, R3)
+	REST_32FPVSRS(R4,R5,R6,R7,R8,R3)
 	blr
 
 /*
  * Store FP state into memory, including FPSCR
  * Assumes the caller has enabled FP in the MSR.
+ *
+ * NOT called from C
  */
 _GLOBAL(store_fp_state)
-	SAVE_32FPVSRS(0, R4, R3)
+	SAVE_32FPVSRS(R4,R5,R6,R7,R8,R3)
 	mffs	fr0
 	stfd	fr0,FPSTATE_FPSCR(r3)
 	blr
@@ -104,6 +113,8 @@ _GLOBAL(store_fp_state)
  * enable the FPU for the current task and return to the task.
  * Note that on 32-bit this can only use registers that will be
  * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
+ *
+ * NOT called from C
  */
 _GLOBAL(load_up_fpu)
 	mfmsr	r5
@@ -137,7 +148,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	addi	r10,r5,THREAD_FPSTATE
 	lfd	fr0,FPSTATE_FPSCR(r10)
 	MTFSF_L(fr0)
-	REST_32FPVSRS(0, R4, R10)
+	REST_32FPVSRS(R3,R4,R5,R6,R11,R10)
 	/* restore registers and return */
 	/* we haven't used ctr or xer or lr */
 	blr
@@ -146,6 +157,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
  * save_fpu(tsk)
  * Save the floating-point registers in its thread_struct.
  * Enables the FPU for use in the kernel on return.
+ *
+ * Is called from C
  */
 _GLOBAL(save_fpu)
 	addi	r3,r3,THREAD	        /* want THREAD of task */
@@ -154,7 +167,7 @@ _GLOBAL(save_fpu)
 	PPC_LCMPI	0,r6,0
 	bne	2f
 	addi	r6,r3,THREAD_FPSTATE
-2:	SAVE_32FPVSRS(0, R4, R6)
+2:	SAVE_32FPVSRS(R4,R5,R7,R8,R9,R6)
 	mffs	fr0
 	stfd	fr0,FPSTATE_FPSCR(r6)
 	blr
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 81e1305..61900b8 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -14,30 +14,32 @@
 
 #ifdef CONFIG_VSX
 /* See fpu.S, this is borrowed from there */
-#define __SAVE_32FPRS_VSRS(n,c,base)		\
-BEGIN_FTR_SECTION				\
-	b	2f;				\
-END_FTR_SECTION_IFSET(CPU_FTR_VSX);		\
-	SAVE_32FPRS(n,base);			\
-	b	3f;				\
-2:	SAVE_32VSRS(n,c,base);			\
+#define __SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+BEGIN_FTR_SECTION							\
+	b	2f;							\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
+	SAVE_32FPRS(0,base);						\
+	b	3f;							\
+2:	SAVE_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
 3:
-#define __REST_32FPRS_VSRS(n,c,base)		\
-BEGIN_FTR_SECTION				\
-	b	2f;				\
-END_FTR_SECTION_IFSET(CPU_FTR_VSX);		\
-	REST_32FPRS(n,base);			\
-	b	3f;				\
-2:	REST_32VSRS(n,c,base);			\
+
+#define __REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)	\
+BEGIN_FTR_SECTION							\
+	b	2f;							\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
+	REST_32FPRS(0,base);						\
+	b	3f;							\
+2:	REST_32VSRS(reg0,reg1,reg2,reg3,reg4,base); \
 3:
+
 #else
-#define __SAVE_32FPRS_VSRS(n,c,base)	SAVE_32FPRS(n, base)
-#define __REST_32FPRS_VSRS(n,c,base)	REST_32FPRS(n, base)
+#define __SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)	SAVE_32FPRS(0, base)
+#define __REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base)	REST_32FPRS(0, base)
 #endif
-#define SAVE_32FPRS_VSRS(n,c,base) \
-	__SAVE_32FPRS_VSRS(n,__REG_##c,__REG_##base)
-#define REST_32FPRS_VSRS(n,c,base) \
-	__REST_32FPRS_VSRS(n,__REG_##c,__REG_##base)
+#define SAVE_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+__SAVE_32FPRS_VSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
+#define REST_32FPRS_VSRS(reg0,reg1,reg2,reg3,reg4,base) \
+__REST_32FPRS_VSRS(__REG_##reg0,__REG_##reg1,__REG_##reg2,__REG_##reg3,__REG_##reg4,__REG_##base)
 
 /* Stack frame offsets for local variables. */
 #define TM_FRAME_L0	TM_FRAME_SIZE-16
@@ -165,7 +167,7 @@ dont_backup_vec:
 	beq	dont_backup_fp
 
 	addi	r7, r3, THREAD_TRANSACT_FPSTATE
-	SAVE_32FPRS_VSRS(0, R6, R7)	/* r6 scratch, r7 transact fp state */
+	SAVE_32FPRS_VSRS(R6,R8,R9,R10,R11,R7) /* r6,r8,r9,r10,r11 scratch, r7 transact fp state */
 
 	mffs    fr0
 	stfd    fr0,FPSTATE_FPSCR(r7)
@@ -375,7 +377,7 @@ dont_restore_vec:
 	addi	r8, r3, THREAD_FPSTATE
 	lfd	fr0, FPSTATE_FPSCR(r8)
 	MTFSF_L(fr0)
-	REST_32FPRS_VSRS(0, R4, R8)
+	REST_32FPRS_VSRS(R4,R5,R6,R7,R9,R8)
 
 dont_restore_fp:
 	mtmsr	r6				/* FP/Vec off again! */
-- 
2.7.2



More information about the Linuxppc-dev mailing list