[PATCH] NO_IDLE_HZ implementation for PPC64

Srivatsa Vaddagiri vatsa at in.ibm.com
Mon Oct 3 03:46:30 EST 2005


Hello,
	The patch below implements NO_IDLE_HZ support for pSeries/PPC64. It
basically lets idle CPUs cut off their timer ticks for as long as they can.

Some notes about the patch:

- Patch is against 2.6.14-rc1 and has undergone some basic testing
  (with an additional patch - also in the mail) on a Power4 box. I intend to
  test on a Power5 box also sometime soon.

- Only pseries_shared_idle and pseries_dedicated_idle routines
  have been converted over to use this support, since I felt
  cutting off ticks doesn't make sense if the idle routine is
  poll-based.

- The boot CPU cannot skip ticks. This is because of the current design wherein
  only the boot CPU updates wall-clock/jiffies.

  I didn't see any particular reason why it has been designed like that
  (maybe to reduce lock contention on xtime_lock?). If we have to allow
  the boot CPU also to skip ticks (which IMO we should), then this design
  needs to change, i.e., we should allow xtime/jiffies to be updated
  from any CPU (like S390 allows). If people agree that this is the
  right direction, then I can give it a shot next.

- By default the feature is disabled at bootup and has to be enabled
  by writing 0 to /proc/sys/kernel/hz_timer. This can be modified
  later after the patch has undergone sufficient testing. Also we can
  introduce a boot-time argument to control this behaviour.

- One requirement is that a call to start_hz_timer should be inserted
  in every possible interrupt path. Towards this end, have I missed
  some interrupt paths? Or have I included some exception path which
  I shouldn't have?


Below are both the patches - actual patch and the patch which I used to
test on Power4 box.

First the actual NO_IDLE_HZ patch:

---

 linux-2.6.14-rc1-root/arch/ppc64/Kconfig                |    6 +
 linux-2.6.14-rc1-root/arch/ppc64/kernel/head.S          |    9 +
 linux-2.6.14-rc1-root/arch/ppc64/kernel/irq.c           |    3 
 linux-2.6.14-rc1-root/arch/ppc64/kernel/pSeries_setup.c |   10 +
 linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c          |   96 ++++++++++++++--
 linux-2.6.14-rc1-root/include/asm-ppc64/time.h          |    8 +
 linux-2.6.14-rc1-root/kernel/sysctl.c                   |   20 +--
 7 files changed, 127 insertions(+), 25 deletions(-)

diff -puN arch/ppc64/kernel/time.c~ppc64 arch/ppc64/kernel/time.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/time.c~ppc64	2005-09-28 19:35:36.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c	2005-10-02 22:53:44.000000000 +0530
@@ -315,23 +315,13 @@ static void iSeries_tb_recal(void)
 
 unsigned long tb_last_stamp __cacheline_aligned_in_smp;
 
-/*
- * timer_interrupt - gets called when the decrementer overflows,
- * with interrupts disabled.
- */
-int timer_interrupt(struct pt_regs * regs)
+static void account_ticks(struct pt_regs *regs)
 {
 	int next_dec;
 	unsigned long cur_tb;
 	struct paca_struct *lpaca = get_paca();
 	unsigned long cpu = smp_processor_id();
 
-	irq_enter();
-
-	profile_tick(CPU_PROFILING, regs);
-
-	lpaca->lppaca.int_dword.fields.decr_int = 0;
-
 	while (lpaca->next_jiffy_update_tb <= (cur_tb = get_tb())) {
 		/*
 		 * We cannot disable the decrementer, so in the period
@@ -364,6 +354,23 @@ int timer_interrupt(struct pt_regs * reg
 	if (next_dec > lpaca->default_decr)
         	next_dec = lpaca->default_decr;
 	set_dec(next_dec);
+}
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+int timer_interrupt(struct pt_regs * regs)
+{
+	struct paca_struct *lpaca = get_paca();
+
+	irq_enter();
+
+	profile_tick(CPU_PROFILING, regs);
+
+	lpaca->lppaca.int_dword.fields.decr_int = 0;
+
+	account_ticks(regs);
 
 #ifdef CONFIG_PPC_ISERIES
 	if (hvlpevent_is_pending())
@@ -381,6 +388,73 @@ int timer_interrupt(struct pt_regs * reg
 	return 1;
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+
+#define MAX_DEC_COUNT	(UINT_MAX)	/* Decrementer is 32-bit */
+#define MIN_SKIP	2
+#define MAX_SKIP	(MAX_DEC_COUNT/tb_ticks_per_jiffy)
+
+int sysctl_hz_timer = 1;
+
+/* Avoid the HZ timer (decrementer) exception on this CPU for "some" time.
+ * Has to be called with interrupts disabled.
+ *
+ * The HZ timer frequency is restored upon the occurence of an interrupt or
+ * exception on this CPU.
+ */
+void stop_hz_timer(void)
+{
+	unsigned long cpu = smp_processor_id(), seq, delta;
+	int next_dec;
+
+	if (sysctl_hz_timer != 0 || cpu == boot_cpuid)
+		return;
+
+	cpu_set(cpu, nohz_cpu_mask);
+	mb();
+	if (rcu_pending(cpu) || local_softirq_pending()) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		return;
+	}
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		delta = next_timer_interrupt() - jiffies;
+
+		if (delta < MIN_SKIP) {
+			cpu_clear(cpu, nohz_cpu_mask);
+			return;
+		}
+
+		if (delta > MAX_SKIP)
+			delta = MAX_SKIP;
+
+		next_dec = tb_last_stamp + (delta-1) * tb_ticks_per_jiffy;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	next_dec -= get_tb();
+	set_dec(next_dec);
+
+	return;
+}
+
+/* Take into account skipped ticks and restore the HZ timer frequency */
+void start_hz_timer(struct pt_regs *regs)
+{
+	unsigned long cpu = smp_processor_id();
+
+	if (!cpu_isset(cpu, nohz_cpu_mask))
+		return;
+
+	cpu_clear(cpu, nohz_cpu_mask);
+	account_ticks(regs);
+}
+
+#endif	/* CONFIG_NO_IDLE_HZ */
+
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  *
diff -puN arch/ppc64/kernel/irq.c~ppc64 arch/ppc64/kernel/irq.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/irq.c~ppc64	2005-09-28 19:35:36.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/irq.c	2005-10-02 21:42:03.000000000 +0530
@@ -55,6 +55,7 @@
 #include <asm/iSeries/ItLpQueue.h>
 #include <asm/machdep.h>
 #include <asm/paca.h>
+#include <asm/time.h>
 
 #ifdef CONFIG_SMP
 extern void iSeries_smp_message_recv( struct pt_regs * );
@@ -313,6 +314,8 @@ void do_IRQ(struct pt_regs *regs)
 
 	irq_enter();
 
+	start_hz_timer(regs);
+
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 2KB free? */
 	{
diff -puN arch/ppc64/kernel/head.S~ppc64 arch/ppc64/kernel/head.S
--- linux-2.6.14-rc1/arch/ppc64/kernel/head.S~ppc64	2005-09-28 19:35:36.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/head.S	2005-10-02 22:45:44.000000000 +0530
@@ -355,6 +355,13 @@ label##_iSeries:							\
 
 #endif
 
+#ifdef CONFIG_NO_IDLE_HZ
+#define START_HZ_TIMER		\
+	bl	.start_hz_timer
+#else
+#define START_HZ_TIMER
+#endif
+
 #define STD_EXCEPTION_COMMON(trap, label, hdlr)		\
 	.align	7;					\
 	.globl label##_common;				\
@@ -363,6 +370,7 @@ label##_common:						\
 	DISABLE_INTS;					\
 	bl	.save_nvgprs;				\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;		\
+	START_HZ_TIMER;					\
 	bl	hdlr;					\
 	b	.ret_from_except
 
@@ -373,6 +381,7 @@ label##_common:						\
 	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);	\
 	DISABLE_INTS;					\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;		\
+	START_HZ_TIMER;					\
 	bl	hdlr;					\
 	b	.ret_from_except_lite
 
diff -puN include/asm-ppc64/time.h~ppc64 include/asm-ppc64/time.h
--- linux-2.6.14-rc1/include/asm-ppc64/time.h~ppc64	2005-09-28 19:43:54.000000000 +0530
+++ linux-2.6.14-rc1-root/include/asm-ppc64/time.h	2005-10-02 21:32:08.000000000 +0530
@@ -102,6 +102,14 @@ static inline unsigned long tb_ticks_sin
 	return get_tb() - tstamp;
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+extern void stop_hz_timer(void);
+extern void start_hz_timer(struct pt_regs *);
+#else
+static inline void stop_hz_timer(void) { }
+static inline void start_hz_timer(struct pt_regs *regs) { }
+#endif
+
 #define mulhwu(x,y) \
 ({unsigned z; asm ("mulhwu %0,%1,%2" : "=r" (z) : "r" (x), "r" (y)); z;})
 #define mulhdu(x,y) \
diff -puN arch/ppc64/Kconfig~ppc64 arch/ppc64/Kconfig
--- linux-2.6.14-rc1/arch/ppc64/Kconfig~ppc64	2005-09-28 20:08:39.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/Kconfig	2005-10-01 15:45:06.000000000 +0530
@@ -146,6 +146,12 @@ config PPC_SPLPAR
 	  processors, that is, which share physical processors between
 	  two or more partitions.
 
+config NO_IDLE_HZ
+	depends on EXPERIMENTAL && PPC_PSERIES
+	bool "No HZ timer ticks in idle"
+	help
+	  Switches the HZ timer interrupts off when a CPU is idle.
+
 config KEXEC
 	bool "kexec system call (EXPERIMENTAL)"
 	depends on PPC_MULTIPLATFORM && EXPERIMENTAL
diff -puN kernel/sysctl.c~ppc64 kernel/sysctl.c
--- linux-2.6.14-rc1/kernel/sysctl.c~ppc64	2005-09-28 21:08:05.000000000 +0530
+++ linux-2.6.14-rc1-root/kernel/sysctl.c	2005-10-01 10:55:32.000000000 +0530
@@ -544,6 +544,16 @@ static ctl_table kern_table[] = {
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
+#ifdef CONFIG_NO_IDLE_HZ
+	{
+		.ctl_name       = KERN_HZ_TIMER,
+		.procname       = "hz_timer",
+		.data           = &sysctl_hz_timer,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_ARCH_S390
 #ifdef CONFIG_MATHEMU
 	{
@@ -555,16 +565,6 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_NO_IDLE_HZ
-	{
-		.ctl_name       = KERN_HZ_TIMER,
-		.procname       = "hz_timer",
-		.data           = &sysctl_hz_timer,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
-	},
-#endif
 	{
 		.ctl_name	= KERN_S390_USER_DEBUG_LOGGING,
 		.procname	= "userprocess_debug",
diff -puN arch/ppc64/kernel/pSeries_setup.c~ppc64 arch/ppc64/kernel/pSeries_setup.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/pSeries_setup.c~ppc64	2005-10-01 11:02:18.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/pSeries_setup.c	2005-10-01 11:10:50.000000000 +0530
@@ -475,9 +475,10 @@ static inline void dedicated_idle_sleep(
 		 * a prod occurs.  Returning from the cede enables external
 		 * interrupts.
 		 */
-		if (!need_resched())
+		if (!need_resched()) {
+			stop_hz_timer();
 			cede_processor();
-		else
+		} else
 			local_irq_enable();
 	} else {
 		/*
@@ -570,9 +571,10 @@ static int pseries_shared_idle(void)
 			 * Check need_resched() again with interrupts disabled
 			 * to avoid a race.
 			 */
-			if (!need_resched())
+			if (!need_resched()) {
+				stop_hz_timer();
 				cede_processor();
-			else
+			} else
 				local_irq_enable();
 
 			HMT_medium();

_
  

Now the test patch. It was something quick that I wrote to get the data
I needed. Are the decrementer exception statistics available somewhere already?
Also note that the test patch assumes that there are 4 CPUs in the machine!


---

 linux-2.6.14-rc1-root/arch/ppc64/kernel/idle.c |    5 +++++
 linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c |    4 ++++
 linux-2.6.14-rc1-root/fs/proc/proc_misc.c      |   12 ++++++++++--
 3 files changed, 19 insertions(+), 2 deletions(-)

diff -puN arch/ppc64/kernel/time.c~debug arch/ppc64/kernel/time.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/time.c~debug	2005-10-02 22:56:58.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c	2005-10-02 22:56:58.000000000 +0530
@@ -315,6 +315,8 @@ static void iSeries_tb_recal(void)
 
 unsigned long tb_last_stamp __cacheline_aligned_in_smp;
 
+DEFINE_PER_CPU(int, dec_ticks);
+
 static void account_ticks(struct pt_regs *regs)
 {
 	int next_dec;
@@ -366,6 +368,8 @@ int timer_interrupt(struct pt_regs * reg
 
 	irq_enter();
 
+	__get_cpu_var(dec_ticks) += 1;
+
 	profile_tick(CPU_PROFILING, regs);
 
 	lpaca->lppaca.int_dword.fields.decr_int = 0;
diff -puN fs/proc/proc_misc.c~debug fs/proc/proc_misc.c
--- linux-2.6.14-rc1/fs/proc/proc_misc.c~debug	2005-10-02 22:56:58.000000000 +0530
+++ linux-2.6.14-rc1-root/fs/proc/proc_misc.c	2005-10-02 22:56:58.000000000 +0530
@@ -233,13 +233,21 @@ static struct file_operations proc_zonei
 	.release	= seq_release,
 };
 
+DECLARE_PER_CPU(int, dec_ticks);
+
 static int version_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
 	int len;
+	char *cp = page;
 
-	strcpy(page, linux_banner);
-	len = strlen(page);
+	cp += sprintf(cp, "%s\n", linux_banner);
+	cp += sprintf (cp, "%d %d %d %d \n",
+			per_cpu(dec_ticks, 0),
+			per_cpu(dec_ticks, 1),
+			per_cpu(dec_ticks, 2),
+			per_cpu(dec_ticks, 3));
+	len = cp - page;
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
diff -puN arch/ppc64/kernel/idle.c~debug arch/ppc64/kernel/idle.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/idle.c~debug	2005-10-02 22:56:58.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/idle.c	2005-10-02 22:56:58.000000000 +0530
@@ -45,6 +45,11 @@ int default_idle(void)
 			while (!need_resched() && !cpu_is_offline(cpu)) {
 				ppc64_runlatch_off();
 
+				local_irq_disable();
+				if (!need_resched())
+					stop_hz_timer();
+				local_irq_enable();
+
 				/*
 				 * Go into low thread priority and possibly
 				 * low power mode.

_



-- 


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017



More information about the Linuxppc64-dev mailing list