[RFC] Moving toward smarter disabling of FPRs, VRs, and VSRs in the MSR

Michael Neuling mikey at neuling.org
Mon Mar 16 17:43:02 EST 2009


> > We can do some VMX testing on existing POWER6 machines.  The VSX
> > instruction set hasn't been fully implemented in GCC yet so we'll need
> > to wait a bit for that.  Does anyone have an idea for a good VMX/Altivec
> > benchmark?
> 
> Note that there are two aspects to the problem:
> 
>  - Lazy save/restore on SMP. This would avoid both the save and restore
> phases, thus is where the most potential gain is to be made. At the
> expense of some tricky IPI work when processes migrate between CPUs.
> 
> However, it will only be useful -if- a process using FP/VMX/VSX is
> "interrupted" by another process that isn't using them. For example, a
> kernel thread. So it's unclear whether that's worth it in practice, ie,
> does this happen that often ?
> 
>  - Always restoring the FP/VMX/VSX state on context switch "in" rather
> than taking a fault. This is reasonably simple, but at the potential
> expense of adding the save/restore overhead to applications that only
> seldom use these facilities. (Some heuristics might help here).
> 
> However, the question here is: what does this buy us ?
> 
> IE, In the worst case scenario, which is HZ=1000, so every 1ms, the
> process would have the overhead of an interrupt to do the restore of the
> state. IE. The restore state itself doesn't count since it would be done
> either way (at context switch vs. in the unavailable interrupt), so all
> we win here is the overhead of the actual interrupt, which is
> implemented as a fast interrupt in assembly. So we have what here ? 1000
> cycles to be pessimistic ? On a 1GHz CPU, that is 1/1000 of the time
> slice, and both of these are rather pessimistic numbers.
> 
> So that leaves us with the possible case of 2 tasks using the facility
> and running a fraction of the timeslice each, for example because they
> are ping-ponging with each other.
> 
> Is that something that happens in practice to make it noticeable ?

I hacked up the below to put stats in /proc/self/sched.

A quick grep through /proc on a rhel5.2 machine (egrep
'(fp_count|switch_count)' /proc/*/sched) shows a few apps use fp a few
dozen times but then stop.  These are only init apps like hald, so we
still need to check some real-world apps too.

Ryan: let me know if this allows you to collect some useful stats.  

Subject: [PATCH] powerpc: add context switch, fpr & vr stats to /proc/self/sched.

Add a counter for every task switch, fp and vr exception to
/proc/self/sched.

[root at p5-20-p6-e0 ~]# cat /proc/3422/sched |tail -3
switch_count                       :                  559
fp_count                           :                  317
vr_count                           :                    0
[root at p5-20-p6-e0 ~]# 

Signed-off-by: Michael Neuling <mikey at neuling.org>
---
 arch/powerpc/include/asm/processor.h |    3 +++
 arch/powerpc/kernel/asm-offsets.c    |    3 +++
 arch/powerpc/kernel/fpu.S            |    3 +++
 arch/powerpc/kernel/process.c        |    3 +++
 arch/powerpc/kernel/setup-common.c   |   10 ++++++++++
 include/linux/seq_file.h             |   12 ++++++++++++
 kernel/sched_debug.c                 |   16 ++++------------
 7 files changed, 38 insertions(+), 12 deletions(-)

Index: linux-2.6-ozlabs/arch/powerpc/include/asm/processor.h
===================================================================
--- linux-2.6-ozlabs.orig/arch/powerpc/include/asm/processor.h
+++ linux-2.6-ozlabs/arch/powerpc/include/asm/processor.h
@@ -174,11 +174,13 @@ struct thread_struct {
 	} fpscr;
 	int		fpexc_mode;	/* floating-point exception mode */
 	unsigned int	align_ctl;	/* alignment handling control */
+	unsigned long	fp_count;	/* FP restore count */
 #ifdef CONFIG_PPC64
 	unsigned long	start_tb;	/* Start purr when proc switched in */
 	unsigned long	accum_tb;	/* Total accumilated purr for process */
 #endif
 	unsigned long	dabr;		/* Data address breakpoint register */
+	unsigned long	switch_count;	/* switch count */
 #ifdef CONFIG_ALTIVEC
 	/* Complete AltiVec register set */
 	vector128	vr[32] __attribute__((aligned(16)));
@@ -186,6 +188,7 @@ struct thread_struct {
 	vector128	vscr __attribute__((aligned(16)));
 	unsigned long	vrsave;
 	int		used_vr;	/* set if process has used altivec */
+	unsigned long	vr_count;	/* VR restore count */
 #endif /* CONFIG_ALTIVEC */
 #ifdef CONFIG_VSX
 	/* VSR status */
Index: linux-2.6-ozlabs/arch/powerpc/kernel/asm-offsets.c
===================================================================
--- linux-2.6-ozlabs.orig/arch/powerpc/kernel/asm-offsets.c
+++ linux-2.6-ozlabs/arch/powerpc/kernel/asm-offsets.c
@@ -74,14 +74,17 @@ int main(void)
 	DEFINE(KSP, offsetof(struct thread_struct, ksp));
 	DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit));
 	DEFINE(PT_REGS, offsetof(struct thread_struct, regs));
+	DEFINE(THREAD_SWITCHCOUNT, offsetof(struct thread_struct, switch_count));
 	DEFINE(THREAD_FPEXC_MODE, offsetof(struct thread_struct, fpexc_mode));
 	DEFINE(THREAD_FPR0, offsetof(struct thread_struct, fpr[0]));
 	DEFINE(THREAD_FPSCR, offsetof(struct thread_struct, fpscr));
+	DEFINE(THREAD_FPCOUNT, offsetof(struct thread_struct, fp_count));
 #ifdef CONFIG_ALTIVEC
 	DEFINE(THREAD_VR0, offsetof(struct thread_struct, vr[0]));
 	DEFINE(THREAD_VRSAVE, offsetof(struct thread_struct, vrsave));
 	DEFINE(THREAD_VSCR, offsetof(struct thread_struct, vscr));
 	DEFINE(THREAD_USED_VR, offsetof(struct thread_struct, used_vr));
+	DEFINE(THREAD_VRCOUNT, offsetof(struct thread_struct, vr_count));
 #endif /* CONFIG_ALTIVEC */
 #ifdef CONFIG_VSX
 	DEFINE(THREAD_VSR0, offsetof(struct thread_struct, fpr));
Index: linux-2.6-ozlabs/arch/powerpc/kernel/fpu.S
===================================================================
--- linux-2.6-ozlabs.orig/arch/powerpc/kernel/fpu.S
+++ linux-2.6-ozlabs/arch/powerpc/kernel/fpu.S
@@ -102,6 +102,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	ori	r12,r12,MSR_FP
 	or	r12,r12,r4
 	std	r12,_MSR(r1)
+	ld	r4,THREAD_FPCOUNT(r5)
+	addi	r4, r4, 1
+	std	r4,THREAD_FPCOUNT(r5)
 #endif
 	lfd	fr0,THREAD_FPSCR(r5)
 	MTFSF_L(fr0)
Index: linux-2.6-ozlabs/arch/powerpc/kernel/process.c
===================================================================
--- linux-2.6-ozlabs.orig/arch/powerpc/kernel/process.c
+++ linux-2.6-ozlabs/arch/powerpc/kernel/process.c
@@ -744,17 +744,20 @@ void start_thread(struct pt_regs *regs, 
 #endif
 
 	discard_lazy_cpu_state();
+	current->thread.switch_count = 0;
 #ifdef CONFIG_VSX
 	current->thread.used_vsr = 0;
 #endif
 	memset(current->thread.fpr, 0, sizeof(current->thread.fpr));
 	current->thread.fpscr.val = 0;
+	current->thread.fp_count = 0;
 #ifdef CONFIG_ALTIVEC
 	memset(current->thread.vr, 0, sizeof(current->thread.vr));
 	memset(&current->thread.vscr, 0, sizeof(current->thread.vscr));
 	current->thread.vscr.u[3] = 0x00010000; /* Java mode disabled */
 	current->thread.vrsave = 0;
 	current->thread.used_vr = 0;
+	current->thread.vr_count = 0;
 #endif /* CONFIG_ALTIVEC */
 #ifdef CONFIG_SPE
 	memset(current->thread.evr, 0, sizeof(current->thread.evr));
Index: linux-2.6-ozlabs/arch/powerpc/kernel/setup-common.c
===================================================================
--- linux-2.6-ozlabs.orig/arch/powerpc/kernel/setup-common.c
+++ linux-2.6-ozlabs/arch/powerpc/kernel/setup-common.c
@@ -669,3 +669,13 @@ static int powerpc_debugfs_init(void)
 }
 arch_initcall(powerpc_debugfs_init);
 #endif
+
+void arch_proc_sched_show_task(struct task_struct *p, struct seq_file *m) {
+	SEQ_printf(m, "%-35s:%21Ld\n",
+		   "switch_count", (long long)p->thread.switch_count);
+	SEQ_printf(m, "%-35s:%21Ld\n",
+		   "fp_count", (long long)p->thread.fp_count);
+	SEQ_printf(m, "%-35s:%21Ld\n",
+		   "vr_count", (long long)p->thread.vr_count);
+}
+
Index: linux-2.6-ozlabs/include/linux/seq_file.h
===================================================================
--- linux-2.6-ozlabs.orig/include/linux/seq_file.h
+++ linux-2.6-ozlabs/include/linux/seq_file.h
@@ -95,4 +95,16 @@ extern struct list_head *seq_list_start_
 extern struct list_head *seq_list_next(void *v, struct list_head *head,
 		loff_t *ppos);
 
+/*
+ * This allows printing both to /proc/sched_debug and
+ * to the console
+ */
+#define SEQ_printf(m, x...)			\
+ do {						\
+	if (m)					\
+		seq_printf(m, x);		\
+	else					\
+		printk(x);			\
+ } while (0)
+
 #endif
Index: linux-2.6-ozlabs/kernel/sched_debug.c
===================================================================
--- linux-2.6-ozlabs.orig/kernel/sched_debug.c
+++ linux-2.6-ozlabs/kernel/sched_debug.c
@@ -17,18 +17,6 @@
 #include <linux/utsname.h>
 
 /*
- * This allows printing both to /proc/sched_debug and
- * to the console
- */
-#define SEQ_printf(m, x...)			\
- do {						\
-	if (m)					\
-		seq_printf(m, x);		\
-	else					\
-		printk(x);			\
- } while (0)
-
-/*
  * Ease the printing of nsec fields:
  */
 static long long nsec_high(unsigned long long nsec)
@@ -370,6 +358,9 @@ static int __init init_sched_debug_procf
 
 __initcall(init_sched_debug_procfs);
 
+void __attribute__ ((weak))
+arch_proc_sched_show_task(struct task_struct *p, struct seq_file *m) {}
+
 void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 {
 	unsigned long nr_switches;
@@ -473,6 +464,7 @@ void proc_sched_show_task(struct task_st
 		SEQ_printf(m, "%-35s:%21Ld\n",
 			   "clock-delta", (long long)(t1-t0));
 	}
+	arch_proc_sched_show_task(p, m);
 }
 
 void proc_sched_set_task(struct task_struct *p)



More information about the Linuxppc-dev mailing list