[PATCH] NO_IDLE_HZ patch updated to 2.6.15-rc3-mm1

Srivatsa Vaddagiri vatsa at in.ibm.com
Fri Dec 2 01:26:19 EST 2005


Hello,
	Here's the updated patch to implement NO_IDLE_HZ on PPC64.
The patch is against 2.6.15-rc3-mm1 and has been tested on a Power5 LPAR.

The patches attached are:

boot_cpu_fix.patch	-> Lets do_timer be called from any CPU
no_idle_hz.patch	-> Implement tickless idle CPUs for PPC64
debug.patch		-> Debug patch that I used for getting
			   decrementer statistics. We need a
		 	   cleaner solution if we have to expose
			   those statistics.

Let me know if you have any comments on these patches.

-- 


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017
-------------- next part --------------
Currently, xtime/jiffies is updated only by the boot CPU, which makes
it difficult for an idle boot CPU to skip ticks. This patch overcomes that
limitation and lets xtime/jiffies be updated from any CPU.


Signed-off-by: Srivatsa Vaddagiri <vatsa at in.ibm.com>

---


diff -puN arch/powerpc/kernel/time.c~boot_cpu_fix arch/powerpc/kernel/time.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/time.c~boot_cpu_fix	2005-12-01 13:14:55.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/time.c	2005-12-01 13:21:52.000000000 -0800
@@ -420,6 +420,7 @@ void timer_interrupt(struct pt_regs * re
 	int next_dec;
 	int cpu = smp_processor_id();
 	unsigned long ticks;
+	int end_singleshot = 0;
 
 #ifdef CONFIG_PPC32
 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
@@ -452,23 +453,29 @@ void timer_interrupt(struct pt_regs * re
 		if (!cpu_is_offline(cpu))
 			update_process_times(user_mode(regs));
 
-		/*
-		 * No need to check whether cpu is offline here; boot_cpuid
-		 * should have been fixed up by now.
-		 */
-		if (cpu != boot_cpuid)
-			continue;
-
 		write_seqlock(&xtime_lock);
-		tb_last_jiffy += tb_ticks_per_jiffy;
-		tb_last_stamp = per_cpu(last_jiffy, cpu);
-		timer_recalc_offset(tb_last_jiffy);
-		do_timer(regs);
-		timer_sync_xtime(tb_last_jiffy);
-		timer_check_rtc();
+		if (tb_ticks_since(tb_last_stamp) >= tb_ticks_per_jiffy) {
+			tb_last_jiffy += tb_ticks_per_jiffy;
+			tb_last_stamp += tb_ticks_per_jiffy;
+			if (__USE_RTC() && tb_last_stamp >= 1000000000)
+			 	tb_last_stamp -= 1000000000;
+			timer_recalc_offset(tb_last_jiffy);
+			do_timer(regs);
+			timer_sync_xtime(tb_last_jiffy);
+			timer_check_rtc();
+		}
+		if (adjusting_time && (time_adjust == 0)) {
+			adjusting_time = 0;
+			end_singleshot = 1;
+		}
 		write_sequnlock(&xtime_lock);
-		if (adjusting_time && (time_adjust == 0))
+
+		if (end_singleshot) {
+#ifdef DEBUG_PPC_ADJTIMEX
+			printk("ppc_adjtimex: ending single shot time_adjust\n");
+#endif
 			ppc_adjtimex();
+		}
 	}
 	
 	next_dec = tb_ticks_per_jiffy - ticks;
@@ -826,13 +833,6 @@ void ppc_adjtimex(void)
 		if ( time_adjust < 0 )
 			singleshot_ppm = -singleshot_ppm;
 	}
-	else {
-#ifdef DEBUG_PPC_ADJTIMEX
-		if ( adjusting_time )
-			printk("ppc_adjtimex: ending single shot time_adjust\n");
-#endif
-		adjusting_time = 0;
-	}
 	
 	/* Add up all of the frequency adjustments */
 	delta_freq = time_freq + ltemp + singleshot_ppm;

_
-------------- next part --------------
This patch causes idle CPUs to skip timer ticks until the next scheduled
event (next_timer_interrupt()) or until some max duration allowed by the
decrementer. This helps to conserve power and on virtual partitions using
shared processors, allows for efficient CPU utilization.

Currently, only a few idle routines have been converted over to use
this feature. Other idle routines could be converted over later
depending on the requirement.

Signed-off-by: Srivatsa Vaddagiri <vatsa at in.ibm.com>


---


diff -puN arch/powerpc/kernel/time.c~no_idle_hz arch/powerpc/kernel/time.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/time.c~no_idle_hz	2005-12-01 16:06:28.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/time.c	2005-12-01 16:18:52.000000000 -0800
@@ -401,40 +401,13 @@ static void iSeries_tb_recal(void)
 }
 #endif
 
-/*
- * For iSeries shared processors, we have to let the hypervisor
- * set the hardware decrementer.  We set a virtual decrementer
- * in the lppaca and call the hypervisor if the virtual
- * decrementer is less than the current value in the hardware
- * decrementer. (almost always the new decrementer value will
- * be greater than the current hardware decementer so the hypervisor
- * call will not be needed)
- */
-
-/*
- * timer_interrupt - gets called when the decrementer overflows,
- * with interrupts disabled.
- */
-void timer_interrupt(struct pt_regs * regs)
+static void account_ticks(struct pt_regs *regs)
 {
 	int next_dec;
 	int cpu = smp_processor_id();
 	unsigned long ticks;
 	int end_singleshot = 0;
 
-#ifdef CONFIG_PPC32
-	if (atomic_read(&ppc_n_lost_interrupts) != 0)
-		do_IRQ(regs);
-#endif
-
-	irq_enter();
-
-	profile_tick(CPU_PROFILING, regs);
-
-#ifdef CONFIG_PPC_ISERIES
-	get_paca()->lppaca.int_dword.fields.decr_int = 0;
-#endif
-
 	while ((ticks = tb_ticks_since(per_cpu(last_jiffy, cpu)))
 	       >= tb_ticks_per_jiffy) {
 		/* Update last_jiffy */
@@ -480,6 +453,58 @@ void timer_interrupt(struct pt_regs * re
 	
 	next_dec = tb_ticks_per_jiffy - ticks;
 	set_dec(next_dec);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+/* Returns 1 if this CPU was set in the mask */
+static inline int clear_hzless_mask(void)
+{
+	unsigned long cpu = smp_processor_id();
+	int rc = 0;
+
+	if (unlikely(cpu_isset(cpu, nohz_cpu_mask))) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		rc = 1;
+	}
+
+	return rc;
+}
+#else
+static inline int clear_hzless_mask(void) { return 0;}
+#endif
+
+/*
+ * For iSeries shared processors, we have to let the hypervisor
+ * set the hardware decrementer.  We set a virtual decrementer
+ * in the lppaca and call the hypervisor if the virtual
+ * decrementer is less than the current value in the hardware
+ * decrementer. (almost always the new decrementer value will
+ * be greater than the current hardware decementer so the hypervisor
+ * call will not be needed)
+ */
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+void timer_interrupt(struct pt_regs * regs)
+{
+#ifdef CONFIG_PPC32
+	if (atomic_read(&ppc_n_lost_interrupts) != 0)
+		do_IRQ(regs);
+#endif
+
+	irq_enter();
+
+	clear_hzless_mask();
+
+	profile_tick(CPU_PROFILING, regs);
+
+#ifdef CONFIG_PPC_ISERIES
+	get_paca()->lppaca.int_dword.fields.decr_int = 0;
+#endif
+
+	account_ticks(regs);
 
 #ifdef CONFIG_PPC_ISERIES
 	if (hvlpevent_is_pending())
@@ -497,6 +522,72 @@ void timer_interrupt(struct pt_regs * re
 	irq_exit();
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+
+#define MAX_DEC_COUNT	(UINT_MAX)	/* Decrementer is 32-bit */
+#define MIN_SKIP	2
+#define MAX_SKIP	(MAX_DEC_COUNT/tb_ticks_per_jiffy)
+
+int sysctl_hz_timer = 1;
+
+/* Avoid the HZ timer (decrementer) interrupt on this CPU for "some" time.
+ * This is accomplished by loading the decrementer with some large calculated
+ * value. The CPU exits this "tickless" state upon the occurence of an
+ * exception or external interrupt, at which point the decrementer is again
+ * reprogrammed to restore the timer interrupt frequency (see start_hz_timer).
+ * Caller has to ensure that the CPU does not exit the "tickless" idle state
+ * via other means.
+ *
+ * Has to be called with interrupts disabled.
+ */
+void stop_hz_timer(void)
+{
+	unsigned long cpu = smp_processor_id(), seq, delta;
+	int next_dec;
+
+	if (sysctl_hz_timer != 0)
+		return;
+
+	cpu_set(cpu, nohz_cpu_mask);
+	smp_mb();
+	if (rcu_pending(cpu) || local_softirq_pending()) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		return;
+	}
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		delta = next_timer_interrupt() - jiffies;
+
+		if (delta < MIN_SKIP) {
+			cpu_clear(cpu, nohz_cpu_mask);
+			return;
+		}
+
+		if (delta > MAX_SKIP)
+			delta = MAX_SKIP;
+
+		next_dec = tb_last_stamp + delta * tb_ticks_per_jiffy;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	next_dec -= get_tbl();
+	set_dec(next_dec);
+
+	return;
+}
+
+/* Take into account skipped ticks and restore the HZ timer frequency */
+void start_hz_timer(struct pt_regs *regs)
+{
+	if (clear_hzless_mask())
+		account_ticks(regs);
+}
+
+#endif	/* CONFIG_NO_IDLE_HZ */
+
+
 void wakeup_decrementer(void)
 {
 	int i;
diff -puN arch/powerpc/kernel/irq.c~no_idle_hz arch/powerpc/kernel/irq.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/irq.c~no_idle_hz	2005-12-01 16:06:28.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/irq.c	2005-12-01 16:18:52.000000000 -0800
@@ -59,6 +59,7 @@
 #include <asm/prom.h>
 #include <asm/ptrace.h>
 #include <asm/machdep.h>
+#include <asm/time.h>
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/paca.h>
 #endif
@@ -192,6 +193,8 @@ void do_IRQ(struct pt_regs *regs)
 
         irq_enter();
 
+	start_hz_timer(regs);
+
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 2KB free? */
 	{
diff -puN include/asm-powerpc/time.h~no_idle_hz include/asm-powerpc/time.h
--- linux-2.6.15-rc3-mm1/include/asm-powerpc/time.h~no_idle_hz	2005-12-01 16:06:39.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/include/asm-powerpc/time.h	2005-12-01 16:06:39.000000000 -0800
@@ -198,6 +198,14 @@ static inline unsigned long tb_ticks_sin
 	return get_tbl() - tstamp;
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+extern void stop_hz_timer(void);
+extern void start_hz_timer(struct pt_regs *);
+#else
+static inline void stop_hz_timer(void) { }
+static inline void start_hz_timer(struct pt_regs *regs) { }
+#endif
+
 #define mulhwu(x,y) \
 ({unsigned z; asm ("mulhwu %0,%1,%2" : "=r" (z) : "r" (x), "r" (y)); z;})
 
diff -puN arch/powerpc/Kconfig~no_idle_hz arch/powerpc/Kconfig
--- linux-2.6.15-rc3-mm1/arch/powerpc/Kconfig~no_idle_hz	2005-12-01 16:06:28.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/Kconfig	2005-12-01 16:06:28.000000000 -0800
@@ -532,6 +532,12 @@ config HOTPLUG_CPU
 
 	  Say N if you are unsure.
 
+config NO_IDLE_HZ
+	depends on EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC || PPC_MAPLE)
+	bool "Skip timer ticks on idle CPUs (EXPERIMENTAL)"
+	help
+	  Switches the HZ timer interrupts off when a CPU is idle.
+
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
 	depends on PPC_MULTIPLATFORM && EXPERIMENTAL
diff -puN kernel/sysctl.c~no_idle_hz kernel/sysctl.c
--- linux-2.6.15-rc3-mm1/kernel/sysctl.c~no_idle_hz	2005-12-01 16:06:36.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/kernel/sysctl.c	2005-12-01 16:06:36.000000000 -0800
@@ -542,6 +542,16 @@ static ctl_table kern_table[] = {
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
+#ifdef CONFIG_NO_IDLE_HZ
+	{
+		.ctl_name       = KERN_HZ_TIMER,
+		.procname       = "hz_timer",
+		.data           = &sysctl_hz_timer,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_ARCH_S390
 #ifdef CONFIG_MATHEMU
 	{
@@ -553,16 +563,6 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_NO_IDLE_HZ
-	{
-		.ctl_name       = KERN_HZ_TIMER,
-		.procname       = "hz_timer",
-		.data           = &sysctl_hz_timer,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
-	},
-#endif
 	{
 		.ctl_name	= KERN_S390_USER_DEBUG_LOGGING,
 		.procname	= "userprocess_debug",
diff -puN arch/powerpc/platforms/pseries/setup.c~no_idle_hz arch/powerpc/platforms/pseries/setup.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/setup.c~no_idle_hz	2005-12-01 16:06:28.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/platforms/pseries/setup.c	2005-12-01 16:18:52.000000000 -0800
@@ -461,9 +461,10 @@ static inline void dedicated_idle_sleep(
 		 * a prod occurs.  Returning from the cede enables external
 		 * interrupts.
 		 */
-		if (!need_resched())
+		if (!need_resched()) {
+			stop_hz_timer();
 			cede_processor();
-		else
+		} else
 			local_irq_enable();
 		set_thread_flag(TIF_POLLING_NRFLAG);
 	} else {
@@ -553,9 +554,10 @@ static void pseries_shared_idle(void)
 			 * Check need_resched() again with interrupts disabled
 			 * to avoid a race.
 			 */
-			if (!need_resched())
+			if (!need_resched()) {
+				stop_hz_timer();
 				cede_processor();
-			else
+			} else
 				local_irq_enable();
 
 			HMT_medium();
diff -puN arch/powerpc/kernel/traps.c~no_idle_hz arch/powerpc/kernel/traps.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/traps.c~no_idle_hz	2005-12-01 16:06:28.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/traps.c	2005-12-01 16:06:28.000000000 -0800
@@ -40,6 +40,7 @@
 #include <asm/machdep.h>
 #include <asm/rtas.h>
 #include <asm/pmc.h>
+#include <asm/time.h>
 #ifdef CONFIG_PPC32
 #include <asm/reg.h>
 #endif
@@ -889,6 +890,7 @@ void altivec_unavailable_exception(struc
 #if defined(CONFIG_PPC64) || defined(CONFIG_E500)
 void performance_monitor_exception(struct pt_regs *regs)
 {
+	start_hz_timer(regs);
 	perf_irq(regs);
 }
 #endif
diff -puN arch/powerpc/kernel/idle_64.c~no_idle_hz arch/powerpc/kernel/idle_64.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/idle_64.c~no_idle_hz	2005-12-01 16:06:28.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/idle_64.c	2005-12-01 16:18:52.000000000 -0800
@@ -66,8 +66,12 @@ void native_idle(void)
 	while (1) {
 		ppc64_runlatch_off();
 
-		if (!need_resched())
-			power4_idle();
+ 		local_irq_disable();
+ 		if (!need_resched()) {
+ 			stop_hz_timer();
+ 			local_irq_enable();
+ 			power4_idle();
+ 		}
 
 		if (need_resched()) {
 			ppc64_runlatch_on();

_
-------------- next part --------------
This patch is a quick hack to get decrementer statistics. Not meant
for inclusion.

---


diff -puN arch/powerpc/kernel/time.c~debug arch/powerpc/kernel/time.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/time.c~debug	2005-12-01 16:19:07.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/time.c	2005-12-01 16:19:07.000000000 -0800
@@ -489,6 +489,8 @@ static inline int clear_hzless_mask(void
  */
 void timer_interrupt(struct pt_regs * regs)
 {
+	int cpu = smp_processor_id();
+
 #ifdef CONFIG_PPC32
 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
 		do_IRQ(regs);
@@ -498,6 +500,8 @@ void timer_interrupt(struct pt_regs * re
 
 	clear_hzless_mask();
 
+	kstat_cpu(cpu).irqs[0]++;
+
 	profile_tick(CPU_PROFILING, regs);
 
 #ifdef CONFIG_PPC_ISERIES
@@ -548,6 +552,9 @@ void stop_hz_timer(void)
 	if (sysctl_hz_timer != 0)
 		return;
 
+	if (cpu_isset(cpu, nohz_cpu_mask))
+		return;
+
 	cpu_set(cpu, nohz_cpu_mask);
 	smp_mb();
 	if (rcu_pending(cpu) || local_softirq_pending()) {
diff -puN arch/powerpc/kernel/idle_64.c~debug arch/powerpc/kernel/idle_64.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/idle_64.c~debug	2005-12-01 16:19:07.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/idle_64.c	2005-12-01 16:19:07.000000000 -0800
@@ -41,6 +41,11 @@ void default_idle(void)
 			while (!need_resched() && !cpu_is_offline(cpu)) {
 				ppc64_runlatch_off();
 
+				local_irq_disable();
+				if (!need_resched())
+					stop_hz_timer();
+				local_irq_enable();
+
 				/*
 				 * Go into low thread priority and possibly
 				 * low power mode.
diff -puN arch/powerpc/kernel/irq.c~debug arch/powerpc/kernel/irq.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/kernel/irq.c~debug	2005-12-01 16:19:07.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/kernel/irq.c	2005-12-01 16:19:07.000000000 -0800
@@ -107,6 +107,10 @@ int show_interrupts(struct seq_file *p, 
 		for_each_online_cpu(j)
 			seq_printf(p, "CPU%d       ", j);
 		seq_putc(p, '\n');
+		seq_printf(p, "%3d: ", i);
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+		seq_putc(p, '\n');
 	}
 
 	if (i < NR_IRQS) {
diff -puN arch/powerpc/platforms/pseries/setup.c~debug arch/powerpc/platforms/pseries/setup.c
--- linux-2.6.15-rc3-mm1/arch/powerpc/platforms/pseries/setup.c~debug	2005-12-01 16:19:07.000000000 -0800
+++ linux-2.6.15-rc3-mm1-root/arch/powerpc/platforms/pseries/setup.c	2005-12-01 16:19:07.000000000 -0800
@@ -498,6 +498,11 @@ static void pseries_dedicated_idle(void)
 			while (!need_resched() && !cpu_is_offline(cpu)) {
 				ppc64_runlatch_off();
 
+				local_irq_disable();
+				if (!need_resched())
+					stop_hz_timer();
+				local_irq_enable();
+
 				/*
 				 * Go into low thread priority and possibly
 				 * low power mode.

_


More information about the Linuxppc64-dev mailing list