[RFC] Add IBM Blue Gene/Q Platform

Jimi Xenidis jimix at pobox.com
Sun Dec 9 09:22:26 EST 2012


On Dec 7, 2012, at 7:38 AM, Jimi Xenidis <jimix at pobox.com> wrote:

> 
> On Dec 6, 2012, at 11:54 PM, Michael Neuling <mikey at neuling.org> wrote:
> 
>>> commit 279c0615917b959a652e81f4ad0d886e2d426d85
>>> Author: Jimi Xenidis <jimix at pobox.com>
>>> Date:   Wed Dec 5 13:43:22 2012 -0500
>>> 
>>>   powerpc/book3e: IBM Blue Gene/Q Quad Processing eXtention (QPX)
>>> 
>>>   This enables kernel support for the QPX extention and is intended for
>>>   processors that support it, usually an IBM Blue Gene processor.
>>>   Turning it on does not effect other processors but it does add code
>>>   and will quadruple the per thread save and restore area for the FPU
>>>   (hense the name).  If you have enabled VSX it will only double the
>>>   space.
>>> 
>>>   Signed-off-by: Jimi Xenidis <jimix at pobox.com>
>> 

<<snip>>
>> 
>> 
>> 
>> +BEGIN_FTR_SECTION							\
>> +	SAVE_32VSRS(n,c,base);						\
>> +END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
>> +BEGIN_FTR_SECTION							\
>> +	SAVE_32QRS(n,c,base);						\
>> +END_FTR_SECTION_IFSET(CPU_FTR_QPX);	
>> 
>> I don't think we want to do this.  We are going to end up with 64
>> NOPS here somewhere.
> 
> Excellent point, NOPs are cheap on most processors but not A2 and a lot of embedded, I can wrap some branches with the FTR instead.
> Do you have a concern on the code size?

Thought about it a bit and came up with this solution for arch/powerpc/kernel/fpu.S.
This should address the following issues
 - MSR_VSX vs MSR_VEC
 - Big chunks of NOPs in the code path
 - Less FTR space fixups at boot time.
 - IMNHSO easier to read especially when disassembled

I did consider using the LR and BLR, but the !CONFIG_SMP case only adds one more special block and uses a different register set.
Also if this is agreeable I would like us to consider removing the *_32FPVSRS* macros entirely and put the FTR tests in the actual code.
This would allow us to use #ifdefs and reduce the amount of code that actually gets compiled.

Thoughts?

diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index e0ada05..5964067 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -25,30 +25,81 @@
 #include <asm/asm-offsets.h>
 #include <asm/ptrace.h>
 
-#ifdef CONFIG_VSX
-#define __REST_32FPVSRS(n,c,base)					\
-BEGIN_FTR_SECTION							\
-	b	2f;							\
-END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
-	REST_32FPRS(n,base);						\
-	b	3f;							\
-2:	REST_32VSRS(n,c,base);						\
-3:
-
-#define __SAVE_32FPVSRS(n,c,base)					\
-BEGIN_FTR_SECTION							\
-	b	2f;							\
-END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
-	SAVE_32FPRS(n,base);						\
-	b	3f;							\
-2:	SAVE_32VSRS(n,c,base);						\
-3:
-#else
-#define __REST_32FPVSRS(n,b,base)	REST_32FPRS(n, base)
-#define __SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n, base)
-#endif
-#define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base)
-#define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base)
+
+/*
+ * Restore subroutines, R4 is scratch and R5 is base
+ */
+vsx_restore:
+	REST_32VSRS(0, __REG_R4, __REG_R5)
+	b after_restore
+qpx_restore:
+	REST_32QRS(0, __REG_R4, __REG_R5)
+	b after_restore
+fpu_restore:
+	REST_32FPRS(0, __REG_R5)
+	b after_restore
+
+#define REST_32FPVSRS(n, c, base)		\
+BEGIN_FTR_SECTION				\
+	b vsx_restore;				\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)		\
+BEGIN_FTR_SECTION				\
+	b qpx_restore;				\
+END_FTR_SECTION_IFSET(CPU_FTR_QPX)		\
+	b fpu_restore;				\
+after_restore:
+
+/*
+ * Save subroutines, R4 is scratch and R3 is base
+ */
+vsx_save:
+	SAVE_32VSRS(0, __REG_R4, __REG_R3)
+	b after_save
+qpx_save:
+	SAVE_32QRS(0, __REG_R4, __REG_R3)
+	b after_save
+fpu_save:
+	SAVE_32FPRS(0, __REG_R3)
+	b after_save
+
+#define SAVE_32FPVSRS(n, c, base)		\
+BEGIN_FTR_SECTION				\
+	b vsx_save;				\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)		\
+BEGIN_FTR_SECTION				\
+	b qpx_save;				\
+END_FTR_SECTION_IFSET(CPU_FTR_QPX)		\
+	b fpu_save;				\
+after_save:
+
+#ifndef CONFIG_SMP
+/*
+ * we need an extra save set for the !CONFIG_SMP case, see below
+ * Scratch it R5 and base is R4
+ */
+vsx_save_nosmp:
+	SAVE_32VSRS(0,R5,R4)
+	b after_save_nosmp
+
+qpx_save_nosmp:
+	SAVE_32QRS(0,R5,R4)
+	b after_save_nosmp
+
+fpu_save_nosmp:
+	SAVE_32FPRS(0,R4)
+	b after_save_nosmp
+
+#define SAVE_32FPVSRS_NOSMP(n,c,base)		\
+BEGIN_FTR_SECTION				\
+	b vsx_save_nosmp;			\
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)		\
+BEGIN_FTR_SECTION				\
+	b qpx_save_nosmp;			\
+END_FTR_SECTION_IFSET(CPU_FTR_QPX)		\
+	b fpu_save_nosmp;			\
+after_save_nosmp:
+
+#endif /* !CONFIG_SMP */
 
 /*
  * This task wants to use the FPU now.
@@ -65,6 +116,11 @@ BEGIN_FTR_SECTION
 	oris	r5,r5,MSR_VSX at h
 END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif
+#ifdef CONFIG_PPC_QPX
+BEGIN_FTR_SECTION
+	oris	r5,r5,MSR_VEC at h
+END_FTR_SECTION_IFSET(CPU_FTR_QPX)
+#endif
 	SYNC
 	MTMSRD(r5)			/* enable use of fpu now */
 	isync
@@ -81,7 +137,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	beq	1f
 	toreal(r4)
 	addi	r4,r4,THREAD		/* want last_task_used_math->thread */
-	SAVE_32FPVSRS(0, R5, R4)
+	SAVE_32FPVSRS_NOSMP(0, R5, R4)
 	mffs	fr0
 	stfd	fr0,THREAD_FPSCR(r4)
 	PPC_LL	r5,PT_REGS(r4)
@@ -94,7 +150,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif /* CONFIG_SMP */
 	/* enable use of FP after return */
 #ifdef CONFIG_PPC32
-	mfspr	r5,SPRN_SPRG_THREAD		/* current task's THREAD (phys) */
+	mfspr	r5,SPRN_SPRG_THREAD	/* current task's THREAD (phys) */
 	lwz	r4,THREAD_FPEXC_MODE(r5)
 	ori	r9,r9,MSR_FP		/* enable FP for current */
 	or	r9,r9,r4
@@ -132,6 +188,11 @@ BEGIN_FTR_SECTION
 	oris	r5,r5,MSR_VSX at h
 END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif
+#ifdef CONFIG_PPC_QPX
+BEGIN_FTR_SECTION
+	oris	r5,r5,MSR_VEC at h
+END_FTR_SECTION_IFSET(CPU_FTR_QPX)
+#endif
 	SYNC_601
 	ISYNC_601
 	MTMSRD(r5)			/* enable use of fpu now */
@@ -148,10 +209,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	beq	1f
 	PPC_LL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
 	li	r3,MSR_FP|MSR_FE0|MSR_FE1
-#ifdef CONFIG_VSX
+#if defined(CONFIG_VSX) || defined(CONFIG_PPC_QPX)
 BEGIN_FTR_SECTION
 	oris	r3,r3,MSR_VSX at h
-END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+END_FTR_SECTION_IFSET(CPU_FTR_VSX | CPU_FTR_QPX)
 #endif
 	andc	r4,r4,r3		/* disable FP for previous task */
 	PPC_STL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)

-jx


> 
>> 
>> I'd like to see this patch broken into different parts.
> 
> I'm not sure how _this_ patch:
>  <https://github.com/jimix/linux-bgq/commit/279c0615917b959a652e81f4ad0d886e2d426d85>
> could be broken up, please advise.
> 
>> 
>> Also, have you boot tested this change on a VSX enabled box?
> 
> I can try, I may bug you for help.
> Is there a commonly test (or apps) I should run?
> 
> -jx
> 
> 
>> 
>> Mikey
> 



More information about the Linuxppc-dev mailing list