[PATCH 2/4] tickless idle cpu: Skip ticks when CPU is idle

Srivatsa Vaddagiri vatsa at in.ibm.com
Fri Apr 7 16:31:32 EST 2006


This is the core patch which skips ticks when a CPU is idle.
Should work on pSeries, pmac and maple machines.

The patch is against 2.6.17-rc1-mm1 and has been tested on a 16-way (with SMT) 
Power5 box (p570).

Signed-off-by: Srivatsa Vaddagiri <vatsa at in.ibm.com>

---

 linux-2.6.17-rc1-root/arch/powerpc/Kconfig                   |    6 
 linux-2.6.17-rc1-root/arch/powerpc/kernel/idle_power4.S      |    3 
 linux-2.6.17-rc1-root/arch/powerpc/kernel/irq.c              |    3 
 linux-2.6.17-rc1-root/arch/powerpc/kernel/time.c             |  150 ++++++++---
 linux-2.6.17-rc1-root/arch/powerpc/kernel/traps.c            |    1 
 linux-2.6.17-rc1-root/arch/powerpc/platforms/pseries/setup.c |    6 
 linux-2.6.17-rc1-root/include/asm-powerpc/time.h             |    8 
 7 files changed, 147 insertions(+), 30 deletions(-)

diff -puN arch/powerpc/kernel/time.c~no_idle_hz arch/powerpc/kernel/time.c
--- linux-2.6.17-rc1/arch/powerpc/kernel/time.c~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/kernel/time.c	2006-04-07 11:29:13.000000000 +0530
@@ -633,40 +633,12 @@ static void iSeries_tb_recal(void)
 }
 #endif
 
-/*
- * For iSeries shared processors, we have to let the hypervisor
- * set the hardware decrementer.  We set a virtual decrementer
- * in the lppaca and call the hypervisor if the virtual
- * decrementer is less than the current value in the hardware
- * decrementer. (almost always the new decrementer value will
- * be greater than the current hardware decementer so the hypervisor
- * call will not be needed)
- */
-
-/*
- * timer_interrupt - gets called when the decrementer overflows,
- * with interrupts disabled.
- */
-void timer_interrupt(struct pt_regs * regs)
+static void account_ticks(struct pt_regs *regs)
 {
 	int next_dec;
 	int cpu = smp_processor_id();
 	unsigned long ticks;
 
-#ifdef CONFIG_PPC32
-	if (atomic_read(&ppc_n_lost_interrupts) != 0)
-		do_IRQ(regs);
-#endif
-
-	irq_enter();
-
-	profile_tick(CPU_PROFILING, regs);
-	calculate_steal_time();
-
-#ifdef CONFIG_PPC_ISERIES
-	get_lppaca()->int_dword.fields.decr_int = 0;
-#endif
-
 	while ((ticks = tb_ticks_since(per_cpu(last_jiffy, cpu)))
 	       >= tb_ticks_per_jiffy) {
 		/* Update last_jiffy */
@@ -701,6 +673,123 @@ void timer_interrupt(struct pt_regs * re
 	
 	next_dec = tb_ticks_per_jiffy - ticks;
 	set_dec(next_dec);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* Drop this CPU from nohz_cpu_mask; returns 1 if it was set, else 0 */
+static inline int clear_hzless_mask(void)
+{
+	unsigned long this_cpu = smp_processor_id();
+
+	if (likely(!cpu_isset(this_cpu, nohz_cpu_mask)))
+		return 0;
+
+	/* This CPU was marked in the mask: clear its bit again. */
+	cpu_clear(this_cpu, nohz_cpu_mask);
+
+	return 1;
+}
+
+#define MAX_DEC_COUNT	INT_MAX	/* DEC is 32-bit; sign bit set == expired */
+static int min_skip = 2;		/* Minimum number of ticks to skip */
+static int max_skip;			/* Maximum number of ticks to skip */
+
+
+int sysctl_hz_timer = 1;
+
+/* Defer timer interrupt for as long as possible. This is accomplished by
+ * programming the decrementer to a suitable value such that it raises the
+ * exception after the desired interval. This feature allows CPUs to
+ * be used more efficiently in virtualized environments.
+ *
+ * Called with interrupts disabled on an idle CPU. Caller has to ensure that
+ * idle loop is not exited w/o start_hz_timer being called via an interrupt
+ * to restore timer interrupt frequency.
+ */
+void stop_hz_timer(void)
+{
+	unsigned long cpu = smp_processor_id(), seq;
+	long delta;	/* signed: the next timer may already be overdue */
+	int next_dec;
+
+	if (sysctl_hz_timer != 0)
+		return;
+
+	/* Set the mask bit, then barrier, before checking for pending work. */
+	cpu_set(cpu, nohz_cpu_mask);
+	mb();
+	if (rcu_pending(cpu) || local_softirq_pending()) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		return;
+	}
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		delta = next_timer_interrupt() - jiffies;
+
+		/* Too soon (or already past): keep the regular tick. */
+		if (delta < min_skip) {
+			cpu_clear(cpu, nohz_cpu_mask);
+			return;
+		}
+
+		if (delta > max_skip)
+			delta = max_skip;
+
+		next_dec = tb_last_stamp + delta * tb_ticks_per_jiffy;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	/* Turn the absolute timebase target into a decrementer count. */
+	next_dec -= get_tb();
+	set_dec(next_dec);
+}
+
+/* If this CPU was marked in nohz_cpu_mask, clear it and account the ticks */
+void start_hz_timer(struct pt_regs *regs)
+{
+	if (clear_hzless_mask())
+		account_ticks(regs);
+}
+
+#else
+static inline int clear_hzless_mask(void) { return 0;}
+#endif
+
+/*
+ * For iSeries shared processors, we have to let the hypervisor
+ * set the hardware decrementer.  We set a virtual decrementer
+ * in the lppaca and call the hypervisor if the virtual
+ * decrementer is less than the current value in the hardware
+ * decrementer. (almost always the new decrementer value will
+ * be greater than the current hardware decrementer so the hypervisor
+ * call will not be needed)
+ */
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+void timer_interrupt(struct pt_regs * regs)
+{
+#ifdef CONFIG_PPC32
+	if (atomic_read(&ppc_n_lost_interrupts) != 0)
+		do_IRQ(regs);
+#endif
+
+	irq_enter();
+
+	clear_hzless_mask();
+
+	profile_tick(CPU_PROFILING, regs);
+	calculate_steal_time();
+
+#ifdef CONFIG_PPC_ISERIES
+	get_lppaca()->int_dword.fields.decr_int = 0;
+#endif
+
+	account_ticks(regs);
 
 #ifdef CONFIG_PPC_ISERIES
 	if (hvlpevent_is_pending())
@@ -955,6 +1044,9 @@ void __init time_init(void)
 	tb_ticks_per_usec = ppc_tb_freq / 1000000;
 	tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
 	calc_cputime_factors();
+#ifdef CONFIG_NO_IDLE_HZ
+	max_skip = __USE_RTC() ? HZ : MAX_DEC_COUNT / tb_ticks_per_jiffy;
+#endif
 
 	/*
 	 * Calculate the length of each tick in ns.  It will not be
diff -puN arch/powerpc/kernel/irq.c~no_idle_hz arch/powerpc/kernel/irq.c
--- linux-2.6.17-rc1/arch/powerpc/kernel/irq.c~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/kernel/irq.c	2006-04-07 04:14:57.000000000 +0530
@@ -60,6 +60,7 @@
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/paca.h>
 #endif
+#include <asm/time.h>
 
 int __irq_offset_value;
 #ifdef CONFIG_PPC32
@@ -189,6 +190,8 @@ void do_IRQ(struct pt_regs *regs)
 
         irq_enter();
 
+	start_hz_timer(regs);
+
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 2KB free? */
 	{
diff -puN include/asm-powerpc/time.h~no_idle_hz include/asm-powerpc/time.h
--- linux-2.6.17-rc1/include/asm-powerpc/time.h~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/include/asm-powerpc/time.h	2006-04-07 04:14:58.000000000 +0530
@@ -198,6 +198,14 @@ static inline unsigned long tb_ticks_sin
 	return get_tbl() - tstamp;
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+extern void stop_hz_timer(void);
+extern void start_hz_timer(struct pt_regs *);
+#else
+static inline void stop_hz_timer(void) { }
+static inline void start_hz_timer(struct pt_regs *regs) { }
+#endif
+
 #define mulhwu(x,y) \
 ({unsigned z; asm ("mulhwu %0,%1,%2" : "=r" (z) : "r" (x), "r" (y)); z;})
 
diff -puN arch/powerpc/Kconfig~no_idle_hz arch/powerpc/Kconfig
--- linux-2.6.17-rc1/arch/powerpc/Kconfig~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/Kconfig	2006-04-07 04:14:58.000000000 +0530
@@ -593,6 +593,12 @@ config HOTPLUG_CPU
 
 	  Say N if you are unsure.
 
+config NO_IDLE_HZ
+	depends on EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC || PPC_MAPLE)
+	bool "Switch off timer ticks on idle CPUs"
+	help
+	  Switches the HZ timer interrupts off when a CPU is idle.
+
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
 	depends on PPC_MULTIPLATFORM && EXPERIMENTAL
diff -puN arch/powerpc/kernel/traps.c~no_idle_hz arch/powerpc/kernel/traps.c
--- linux-2.6.17-rc1/arch/powerpc/kernel/traps.c~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/kernel/traps.c	2006-04-07 04:14:58.000000000 +0530
@@ -875,6 +875,7 @@ void altivec_unavailable_exception(struc
 
 void performance_monitor_exception(struct pt_regs *regs)
 {
+	start_hz_timer(regs);
 	perf_irq(regs);
 }
 
diff -puN arch/powerpc/platforms/pseries/setup.c~no_idle_hz arch/powerpc/platforms/pseries/setup.c
--- linux-2.6.17-rc1/arch/powerpc/platforms/pseries/setup.c~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/platforms/pseries/setup.c	2006-04-07 04:15:50.000000000 +0530
@@ -463,8 +463,10 @@ static void pseries_dedicated_idle_sleep
 	 * very low priority.  The cede enables interrupts, which
 	 * doesn't matter here.
 	 */
-	if (!lppaca[cpu ^ 1].idle || poll_pending() == H_PENDING)
+	if (!lppaca[cpu ^ 1].idle || poll_pending() == H_PENDING) {
+		stop_hz_timer();
 		cede_processor();
+	}
 
 out:
 	HMT_medium();
@@ -479,6 +481,8 @@ static void pseries_shared_idle_sleep(vo
 	 */
 	get_lppaca()->idle = 1;
 
+	stop_hz_timer();
+
 	/*
 	 * Yield the processor to the hypervisor.  We return if
 	 * an external interrupt occurs (which are driven prior
diff -puN arch/powerpc/kernel/idle_power4.S~no_idle_hz arch/powerpc/kernel/idle_power4.S
--- linux-2.6.17-rc1/arch/powerpc/kernel/idle_power4.S~no_idle_hz	2006-04-07 04:14:39.000000000 +0530
+++ linux-2.6.17-rc1-root/arch/powerpc/kernel/idle_power4.S	2006-04-07 04:14:58.000000000 +0530
@@ -30,6 +30,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP)
 	cmpwi	0,r4,0
 	beqlr
 
+	mflr	r4		/* NOTE(review): r4 is volatile per the ELF ABI; */
+	bl	.stop_hz_timer	/* the C callee may clobber the saved LR -- */
+	mtlr	r4		/* confirm, or save LR in a stack frame instead */
 	/* Go to NAP now */
 BEGIN_FTR_SECTION
 	DSSALL

_


-- 
Regards,
vatsa



More information about the Linuxppc-dev mailing list