[PATCH] NO_IDLE_HZ implementation for PPC64
Srivatsa Vaddagiri
vatsa at in.ibm.com
Mon Oct 3 03:46:30 EST 2005
Hello,
The patch below implements NO_IDLE_HZ support for pSeries/PPC64. It
basically lets idle CPUs cut off their timer ticks for as long as they can.
Some notes about the patch:
- Patch is against 2.6.14-rc1 and has undergone some basic testing
(with an additional patch - also in the mail) on a Power4 box. I intend to
test on a Power5 box also sometime soon.
- Only the pseries_shared_idle and pseries_dedicated_idle routines
have been converted over to use this support, since I felt
cutting off ticks doesn't make sense if the idle routine is
poll-based.
- The boot CPU cannot skip ticks. This is because of the current design wherein
only the boot CPU updates wall-clock/jiffies.
I didn't see any particular reason why it has been designed like that
(maybe to reduce lock contention on xtime_lock?). If we have to allow
the boot CPU also to skip ticks (which IMO we should), then this design
needs to change, i.e. we should allow xtime/jiffies to be updated
from any CPU (like S390 allows). If people agree that this is the
right direction, then I can give it a shot next.
- By default the feature is disabled at bootup and has to be enabled
by writing 0 to /proc/sys/kernel/hz_timer. This default can be modified
later, after the patch has undergone sufficient testing. Also we can
introduce a boot-time argument to control this behaviour.
- One requirement is that a call to start_hz_timer should be inserted
in every possible interrupt path. Towards this end, have I missed
some interrupt paths? Or have I included some exception path which
I shouldn't have?
Below are both the patches - actual patch and the patch which I used to
test on Power4 box.
First the actual NO_IDLE_HZ patch:
---
linux-2.6.14-rc1-root/arch/ppc64/Kconfig | 6 +
linux-2.6.14-rc1-root/arch/ppc64/kernel/head.S | 9 +
linux-2.6.14-rc1-root/arch/ppc64/kernel/irq.c | 3
linux-2.6.14-rc1-root/arch/ppc64/kernel/pSeries_setup.c | 10 +
linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c | 96 ++++++++++++++--
linux-2.6.14-rc1-root/include/asm-ppc64/time.h | 8 +
linux-2.6.14-rc1-root/kernel/sysctl.c | 20 +--
7 files changed, 127 insertions(+), 25 deletions(-)
diff -puN arch/ppc64/kernel/time.c~ppc64 arch/ppc64/kernel/time.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/time.c~ppc64 2005-09-28 19:35:36.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c 2005-10-02 22:53:44.000000000 +0530
@@ -315,23 +315,13 @@ static void iSeries_tb_recal(void)
unsigned long tb_last_stamp __cacheline_aligned_in_smp;
-/*
- * timer_interrupt - gets called when the decrementer overflows,
- * with interrupts disabled.
- */
-int timer_interrupt(struct pt_regs * regs)
+static void account_ticks(struct pt_regs *regs)
{
int next_dec;
unsigned long cur_tb;
struct paca_struct *lpaca = get_paca();
unsigned long cpu = smp_processor_id();
- irq_enter();
-
- profile_tick(CPU_PROFILING, regs);
-
- lpaca->lppaca.int_dword.fields.decr_int = 0;
-
while (lpaca->next_jiffy_update_tb <= (cur_tb = get_tb())) {
/*
* We cannot disable the decrementer, so in the period
@@ -364,6 +354,23 @@ int timer_interrupt(struct pt_regs * reg
if (next_dec > lpaca->default_decr)
next_dec = lpaca->default_decr;
set_dec(next_dec);
+}
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+int timer_interrupt(struct pt_regs * regs)
+{
+ struct paca_struct *lpaca = get_paca();
+
+ irq_enter();
+
+ profile_tick(CPU_PROFILING, regs);
+
+ lpaca->lppaca.int_dword.fields.decr_int = 0;
+
+ account_ticks(regs);
#ifdef CONFIG_PPC_ISERIES
if (hvlpevent_is_pending())
@@ -381,6 +388,73 @@ int timer_interrupt(struct pt_regs * reg
return 1;
}
+#ifdef CONFIG_NO_IDLE_HZ
+
+#define MAX_DEC_COUNT (UINT_MAX) /* Decrementer is 32-bit */
+#define MIN_SKIP 2
+#define MAX_SKIP (MAX_DEC_COUNT/tb_ticks_per_jiffy)
+
+int sysctl_hz_timer = 1;
+
+/* Avoid the HZ timer (decrementer) exception on this CPU for "some" time.
+ * Has to be called with interrupts disabled.
+ *
+ * The HZ timer frequency is restored upon the occurrence of an interrupt or
+ * exception on this CPU.
+ */
+void stop_hz_timer(void)
+{
+ unsigned long cpu = smp_processor_id(), seq, delta;
+ int next_dec;
+
+ if (sysctl_hz_timer != 0 || cpu == boot_cpuid)
+ return;
+
+ cpu_set(cpu, nohz_cpu_mask);
+ mb();
+ if (rcu_pending(cpu) || local_softirq_pending()) {
+ cpu_clear(cpu, nohz_cpu_mask);
+ return;
+ }
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ delta = next_timer_interrupt() - jiffies;
+
+ if (delta < MIN_SKIP) {
+ cpu_clear(cpu, nohz_cpu_mask);
+ return;
+ }
+
+ if (delta > MAX_SKIP)
+ delta = MAX_SKIP;
+
+ next_dec = tb_last_stamp + (delta-1) * tb_ticks_per_jiffy;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ next_dec -= get_tb();
+ set_dec(next_dec);
+
+ return;
+}
+
+/* Take into account skipped ticks and restore the HZ timer frequency */
+void start_hz_timer(struct pt_regs *regs)
+{
+ unsigned long cpu = smp_processor_id();
+
+ if (!cpu_isset(cpu, nohz_cpu_mask))
+ return;
+
+ cpu_clear(cpu, nohz_cpu_mask);
+ account_ticks(regs);
+}
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+
/*
* Scheduler clock - returns current time in nanosec units.
*
diff -puN arch/ppc64/kernel/irq.c~ppc64 arch/ppc64/kernel/irq.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/irq.c~ppc64 2005-09-28 19:35:36.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/irq.c 2005-10-02 21:42:03.000000000 +0530
@@ -55,6 +55,7 @@
#include <asm/iSeries/ItLpQueue.h>
#include <asm/machdep.h>
#include <asm/paca.h>
+#include <asm/time.h>
#ifdef CONFIG_SMP
extern void iSeries_smp_message_recv( struct pt_regs * );
@@ -313,6 +314,8 @@ void do_IRQ(struct pt_regs *regs)
irq_enter();
+ start_hz_timer(regs);
+
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 2KB free? */
{
diff -puN arch/ppc64/kernel/head.S~ppc64 arch/ppc64/kernel/head.S
--- linux-2.6.14-rc1/arch/ppc64/kernel/head.S~ppc64 2005-09-28 19:35:36.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/head.S 2005-10-02 22:45:44.000000000 +0530
@@ -355,6 +355,13 @@ label##_iSeries: \
#endif
+#ifdef CONFIG_NO_IDLE_HZ
+#define START_HZ_TIMER \
+ bl .start_hz_timer
+#else
+#define START_HZ_TIMER
+#endif
+
#define STD_EXCEPTION_COMMON(trap, label, hdlr) \
.align 7; \
.globl label##_common; \
@@ -363,6 +370,7 @@ label##_common: \
DISABLE_INTS; \
bl .save_nvgprs; \
addi r3,r1,STACK_FRAME_OVERHEAD; \
+ START_HZ_TIMER; \
bl hdlr; \
b .ret_from_except
@@ -373,6 +381,7 @@ label##_common: \
EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \
DISABLE_INTS; \
addi r3,r1,STACK_FRAME_OVERHEAD; \
+ START_HZ_TIMER; \
bl hdlr; \
b .ret_from_except_lite
diff -puN include/asm-ppc64/time.h~ppc64 include/asm-ppc64/time.h
--- linux-2.6.14-rc1/include/asm-ppc64/time.h~ppc64 2005-09-28 19:43:54.000000000 +0530
+++ linux-2.6.14-rc1-root/include/asm-ppc64/time.h 2005-10-02 21:32:08.000000000 +0530
@@ -102,6 +102,14 @@ static inline unsigned long tb_ticks_sin
return get_tb() - tstamp;
}
+#ifdef CONFIG_NO_IDLE_HZ
+extern void stop_hz_timer(void);
+extern void start_hz_timer(struct pt_regs *);
+#else
+static inline void stop_hz_timer(void) { }
+static inline void start_hz_timer(struct pt_regs *regs) { }
+#endif
+
#define mulhwu(x,y) \
({unsigned z; asm ("mulhwu %0,%1,%2" : "=r" (z) : "r" (x), "r" (y)); z;})
#define mulhdu(x,y) \
diff -puN arch/ppc64/Kconfig~ppc64 arch/ppc64/Kconfig
--- linux-2.6.14-rc1/arch/ppc64/Kconfig~ppc64 2005-09-28 20:08:39.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/Kconfig 2005-10-01 15:45:06.000000000 +0530
@@ -146,6 +146,12 @@ config PPC_SPLPAR
processors, that is, which share physical processors between
two or more partitions.
+config NO_IDLE_HZ
+ depends on EXPERIMENTAL && PPC_PSERIES
+ bool "No HZ timer ticks in idle"
+ help
+ Switches the HZ timer interrupts off when a CPU is idle.
+
config KEXEC
bool "kexec system call (EXPERIMENTAL)"
depends on PPC_MULTIPLATFORM && EXPERIMENTAL
diff -puN kernel/sysctl.c~ppc64 kernel/sysctl.c
--- linux-2.6.14-rc1/kernel/sysctl.c~ppc64 2005-09-28 21:08:05.000000000 +0530
+++ linux-2.6.14-rc1-root/kernel/sysctl.c 2005-10-01 10:55:32.000000000 +0530
@@ -544,6 +544,16 @@ static ctl_table kern_table[] = {
.extra1 = &minolduid,
.extra2 = &maxolduid,
},
+#ifdef CONFIG_NO_IDLE_HZ
+ {
+ .ctl_name = KERN_HZ_TIMER,
+ .procname = "hz_timer",
+ .data = &sysctl_hz_timer,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
#ifdef CONFIG_ARCH_S390
#ifdef CONFIG_MATHEMU
{
@@ -555,16 +565,6 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
-#ifdef CONFIG_NO_IDLE_HZ
- {
- .ctl_name = KERN_HZ_TIMER,
- .procname = "hz_timer",
- .data = &sysctl_hz_timer,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
-#endif
{
.ctl_name = KERN_S390_USER_DEBUG_LOGGING,
.procname = "userprocess_debug",
diff -puN arch/ppc64/kernel/pSeries_setup.c~ppc64 arch/ppc64/kernel/pSeries_setup.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/pSeries_setup.c~ppc64 2005-10-01 11:02:18.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/pSeries_setup.c 2005-10-01 11:10:50.000000000 +0530
@@ -475,9 +475,10 @@ static inline void dedicated_idle_sleep(
* a prod occurs. Returning from the cede enables external
* interrupts.
*/
- if (!need_resched())
+ if (!need_resched()) {
+ stop_hz_timer();
cede_processor();
- else
+ } else
local_irq_enable();
} else {
/*
@@ -570,9 +571,10 @@ static int pseries_shared_idle(void)
* Check need_resched() again with interrupts disabled
* to avoid a race.
*/
- if (!need_resched())
+ if (!need_resched()) {
+ stop_hz_timer();
cede_processor();
- else
+ } else
local_irq_enable();
HMT_medium();
_
Now the test patch. It was something quick that I wrote to get the data
I needed. Are the decrementer exception statistics available somewhere already?
Also note that it assumes there are 4 CPUs in the machine!
---
linux-2.6.14-rc1-root/arch/ppc64/kernel/idle.c | 5 +++++
linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c | 4 ++++
linux-2.6.14-rc1-root/fs/proc/proc_misc.c | 12 ++++++++++--
3 files changed, 19 insertions(+), 2 deletions(-)
diff -puN arch/ppc64/kernel/time.c~debug arch/ppc64/kernel/time.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/time.c~debug 2005-10-02 22:56:58.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c 2005-10-02 22:56:58.000000000 +0530
@@ -315,6 +315,8 @@ static void iSeries_tb_recal(void)
unsigned long tb_last_stamp __cacheline_aligned_in_smp;
+DEFINE_PER_CPU(int, dec_ticks);
+
static void account_ticks(struct pt_regs *regs)
{
int next_dec;
@@ -366,6 +368,8 @@ int timer_interrupt(struct pt_regs * reg
irq_enter();
+ __get_cpu_var(dec_ticks) += 1;
+
profile_tick(CPU_PROFILING, regs);
lpaca->lppaca.int_dword.fields.decr_int = 0;
diff -puN fs/proc/proc_misc.c~debug fs/proc/proc_misc.c
--- linux-2.6.14-rc1/fs/proc/proc_misc.c~debug 2005-10-02 22:56:58.000000000 +0530
+++ linux-2.6.14-rc1-root/fs/proc/proc_misc.c 2005-10-02 22:56:58.000000000 +0530
@@ -233,13 +233,21 @@ static struct file_operations proc_zonei
.release = seq_release,
};
+DECLARE_PER_CPU(int, dec_ticks);
+
static int version_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
int len;
+ char *cp = page;
- strcpy(page, linux_banner);
- len = strlen(page);
+ cp += sprintf(cp, "%s\n", linux_banner);
+ cp += sprintf (cp, "%d %d %d %d \n",
+ per_cpu(dec_ticks, 0),
+ per_cpu(dec_ticks, 1),
+ per_cpu(dec_ticks, 2),
+ per_cpu(dec_ticks, 3));
+ len = cp - page;
return proc_calc_metrics(page, start, off, count, eof, len);
}
diff -puN arch/ppc64/kernel/idle.c~debug arch/ppc64/kernel/idle.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/idle.c~debug 2005-10-02 22:56:58.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/idle.c 2005-10-02 22:56:58.000000000 +0530
@@ -45,6 +45,11 @@ int default_idle(void)
while (!need_resched() && !cpu_is_offline(cpu)) {
ppc64_runlatch_off();
+ local_irq_disable();
+ if (!need_resched())
+ stop_hz_timer();
+ local_irq_enable();
+
/*
* Go into low thread priority and possibly
* low power mode.
_
--
Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017
More information about the Linuxppc64-dev
mailing list