[PATCH] NO_IDLE_HZ implementation for ppc64 - v2

Srivatsa Vaddagiri vatsa at in.ibm.com
Thu Oct 6 04:01:42 EST 2005


Ben,
	Here's the revised version, taking into account some of the
comments you had. Changes since last time:

- native_idle also converted over
- Fixed a bug in calculation of next_dec in stop_hz_timer
- Removed call to start_hz_timer from head.S
- Added a call to start_hz_timer in performance_monitor_exception

This has been tested against 2.6.14-rc1 on a 4-way Power4 box (p630)
with one additional patch applied (the same test patch I sent earlier,
which shows decrementer statistics in /proc).

I will rebase this patch against latest -mm if you think this is in
the right direction.



Signed-off-by: Srivatsa Vaddagiri <vatsa at in.ibm.com>

---

 linux-2.6.14-rc1-root/arch/ppc64/Kconfig                |    6 
 linux-2.6.14-rc1-root/arch/ppc64/kernel/idle.c          |    6 
 linux-2.6.14-rc1-root/arch/ppc64/kernel/irq.c           |    3 
 linux-2.6.14-rc1-root/arch/ppc64/kernel/pSeries_setup.c |   10 -
 linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c          |  112 ++++++++++++++--
 linux-2.6.14-rc1-root/arch/ppc64/kernel/traps.c         |    2 
 linux-2.6.14-rc1-root/include/asm-ppc64/time.h          |    8 +
 linux-2.6.14-rc1-root/kernel/sysctl.c                   |   20 +-
 8 files changed, 141 insertions(+), 26 deletions(-)

diff -puN arch/ppc64/kernel/time.c~ppc64 arch/ppc64/kernel/time.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/time.c~ppc64	2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c	2005-10-05 16:34:51.000000000 +0530
@@ -315,23 +315,13 @@ static void iSeries_tb_recal(void)
 
 unsigned long tb_last_stamp __cacheline_aligned_in_smp;
 
-/*
- * timer_interrupt - gets called when the decrementer overflows,
- * with interrupts disabled.
- */
-int timer_interrupt(struct pt_regs * regs)
+static void account_ticks(struct pt_regs *regs)
 {
 	int next_dec;
 	unsigned long cur_tb;
 	struct paca_struct *lpaca = get_paca();
 	unsigned long cpu = smp_processor_id();
 
-	irq_enter();
-
-	profile_tick(CPU_PROFILING, regs);
-
-	lpaca->lppaca.int_dword.fields.decr_int = 0;
-
 	while (lpaca->next_jiffy_update_tb <= (cur_tb = get_tb())) {
 		/*
 		 * We cannot disable the decrementer, so in the period
@@ -364,6 +354,43 @@ int timer_interrupt(struct pt_regs * reg
 	if (next_dec > lpaca->default_decr)
         	next_dec = lpaca->default_decr;
 	set_dec(next_dec);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+/* Drop this CPU from nohz_cpu_mask; returns 1 if it was set, 0 otherwise */
+static inline int clear_hzless_mask(void)
+{
+	unsigned long this_cpu = smp_processor_id();
+
+	/* Expected case: this CPU is not marked as running without ticks */
+	if (likely(!cpu_isset(this_cpu, nohz_cpu_mask)))
+		return 0;
+
+	cpu_clear(this_cpu, nohz_cpu_mask);
+
+	return 1;
+}
+#else
+static inline int clear_hzless_mask(void) { return 0; }
+#endif
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+int timer_interrupt(struct pt_regs * regs)
+{
+	struct paca_struct *lpaca = get_paca();
+
+	irq_enter();
+
+	clear_hzless_mask();
+
+	profile_tick(CPU_PROFILING, regs);
+
+	lpaca->lppaca.int_dword.fields.decr_int = 0;
+
+	account_ticks(regs);
 
 #ifdef CONFIG_PPC_ISERIES
 	if (hvlpevent_is_pending())
@@ -381,6 +408,69 @@ int timer_interrupt(struct pt_regs * reg
 	return 1;
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+
+#define MAX_DEC_COUNT	(INT_MAX)	/* DEC is signed 32-bit: negative fires at once */
+#define MIN_SKIP	2	/* fewest idle jiffies worth stopping the tick for */
+#define MAX_SKIP	(MAX_DEC_COUNT/tb_ticks_per_jiffy)	/* longest skip, in jiffies */
+
+int sysctl_hz_timer = 1;	/* kernel.hz_timer sysctl; non-zero keeps the tick */
+
+/* Avoid the HZ timer (decrementer) interrupt on this CPU for "some" time.
+ * Has to be called with interrupts disabled.
+ *
+ * The HZ timer frequency is restored upon the occurrence of an interrupt or
+ * exception on this CPU. Caller has to ensure that the CPU doesn't exit
+ * idle mode via other means.
+ */
+void stop_hz_timer(void)
+{
+	unsigned long cpu = smp_processor_id(), seq, delta;
+	int next_dec;
+
+	if (sysctl_hz_timer != 0)	/* non-zero sysctl: keep the regular tick */
+		return;
+
+	cpu_set(cpu, nohz_cpu_mask);
+	mb();	/* NOTE(review): presumably orders the mask update before the checks */
+	if (rcu_pending(cpu) || local_softirq_pending()) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		return;
+	}
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		delta = next_timer_interrupt() - jiffies;	/* jiffies to next timer */
+
+		if (delta < MIN_SKIP) {	/* too close: leave the tick running */
+			cpu_clear(cpu, nohz_cpu_mask);
+			return;
+		}
+
+		if (delta > MAX_SKIP)	/* cap the skip at MAX_SKIP jiffies */
+			delta = MAX_SKIP;
+
+		next_dec = tb_last_stamp + delta * tb_ticks_per_jiffy;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	next_dec -= get_tb();	/* timebase target -> ticks remaining from now */
+	set_dec(next_dec);
+
+	return;
+}
+
+/* If this CPU's tick was stopped, account skipped ticks and re-arm the timer */
+void start_hz_timer(struct pt_regs *regs)
+{
+	if (clear_hzless_mask())
+		account_ticks(regs);
+}
+
+#endif	/* CONFIG_NO_IDLE_HZ */
+
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  *
diff -puN arch/ppc64/kernel/irq.c~ppc64 arch/ppc64/kernel/irq.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/irq.c~ppc64	2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/irq.c	2005-10-05 16:33:06.000000000 +0530
@@ -55,6 +55,7 @@
 #include <asm/iSeries/ItLpQueue.h>
 #include <asm/machdep.h>
 #include <asm/paca.h>
+#include <asm/time.h>
 
 #ifdef CONFIG_SMP
 extern void iSeries_smp_message_recv( struct pt_regs * );
@@ -313,6 +314,8 @@ void do_IRQ(struct pt_regs *regs)
 
 	irq_enter();
 
+	start_hz_timer(regs);
+
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 2KB free? */
 	{
diff -puN include/asm-ppc64/time.h~ppc64 include/asm-ppc64/time.h
--- linux-2.6.14-rc1/include/asm-ppc64/time.h~ppc64	2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/include/asm-ppc64/time.h	2005-10-05 16:33:06.000000000 +0530
@@ -102,6 +102,14 @@ static inline unsigned long tb_ticks_sin
 	return get_tb() - tstamp;
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+extern void stop_hz_timer(void);	/* call with interrupts disabled */
+extern void start_hz_timer(struct pt_regs *);
+#else	/* !CONFIG_NO_IDLE_HZ: empty stubs, so call sites need no #ifdef */
+static inline void stop_hz_timer(void) { }
+static inline void start_hz_timer(struct pt_regs *regs) { }
+#endif	/* CONFIG_NO_IDLE_HZ */
+
 #define mulhwu(x,y) \
 ({unsigned z; asm ("mulhwu %0,%1,%2" : "=r" (z) : "r" (x), "r" (y)); z;})
 #define mulhdu(x,y) \
diff -puN arch/ppc64/Kconfig~ppc64 arch/ppc64/Kconfig
--- linux-2.6.14-rc1/arch/ppc64/Kconfig~ppc64	2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/Kconfig	2005-10-05 16:33:06.000000000 +0530
@@ -146,6 +146,12 @@ config PPC_SPLPAR
 	  processors, that is, which share physical processors between
 	  two or more partitions.
 
+config NO_IDLE_HZ
+	depends on EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC || PPC_MAPLE)
+	bool "No HZ timer ticks in idle"
+	help
+	  Switches the HZ timer interrupts off when a CPU is idle.
+
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
 	depends on PPC_MULTIPLATFORM && EXPERIMENTAL
diff -puN kernel/sysctl.c~ppc64 kernel/sysctl.c
--- linux-2.6.14-rc1/kernel/sysctl.c~ppc64	2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/kernel/sysctl.c	2005-10-05 16:33:06.000000000 +0530
@@ -544,6 +544,16 @@ static ctl_table kern_table[] = {
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
+#ifdef CONFIG_NO_IDLE_HZ
+	{
+		.ctl_name       = KERN_HZ_TIMER,
+		.procname       = "hz_timer",
+		.data           = &sysctl_hz_timer,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_ARCH_S390
 #ifdef CONFIG_MATHEMU
 	{
@@ -555,16 +565,6 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_NO_IDLE_HZ
-	{
-		.ctl_name       = KERN_HZ_TIMER,
-		.procname       = "hz_timer",
-		.data           = &sysctl_hz_timer,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
-	},
-#endif
 	{
 		.ctl_name	= KERN_S390_USER_DEBUG_LOGGING,
 		.procname	= "userprocess_debug",
diff -puN arch/ppc64/kernel/pSeries_setup.c~ppc64 arch/ppc64/kernel/pSeries_setup.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/pSeries_setup.c~ppc64	2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/pSeries_setup.c	2005-10-05 16:33:06.000000000 +0530
@@ -475,9 +475,10 @@ static inline void dedicated_idle_sleep(
 		 * a prod occurs.  Returning from the cede enables external
 		 * interrupts.
 		 */
-		if (!need_resched())
+		if (!need_resched()) {
+			stop_hz_timer();
 			cede_processor();
-		else
+		} else
 			local_irq_enable();
 	} else {
 		/*
@@ -570,9 +571,10 @@ static int pseries_shared_idle(void)
 			 * Check need_resched() again with interrupts disabled
 			 * to avoid a race.
 			 */
-			if (!need_resched())
+			if (!need_resched()) {
+				stop_hz_timer();
 				cede_processor();
-			else
+			} else
 				local_irq_enable();
 
 			HMT_medium();
diff -puN arch/ppc64/kernel/traps.c~ppc64 arch/ppc64/kernel/traps.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/traps.c~ppc64	2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/traps.c	2005-10-05 16:33:06.000000000 +0530
@@ -43,6 +43,7 @@
 #include <asm/systemcfg.h>
 #include <asm/machdep.h>
 #include <asm/pmc.h>
+#include <asm/time.h>
 
 #ifdef CONFIG_DEBUGGER
 int (*__debugger)(struct pt_regs *regs);
@@ -470,6 +471,7 @@ extern perf_irq_t perf_irq;
 
 void performance_monitor_exception(struct pt_regs *regs)
 {
+	start_hz_timer(regs);	/* restore the HZ tick if it was stopped */
 	perf_irq(regs);
 }
 
diff -puN arch/ppc64/kernel/idle.c~ppc64 arch/ppc64/kernel/idle.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/idle.c~ppc64	2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/idle.c	2005-10-05 16:33:06.000000000 +0530
@@ -73,8 +73,12 @@ int native_idle(void)
 	while (1) {
 		ppc64_runlatch_off();
 
-		if (!need_resched())
+		local_irq_disable();
+		if (!need_resched()) {
+			stop_hz_timer();
+			local_irq_enable();
 			power4_idle();
+		}
 
 		if (need_resched()) {
 			ppc64_runlatch_on();

_
-- 


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017



More information about the Linuxppc64-dev mailing list