[PATCH] NO_IDLE_HZ implementation for ppc64 - v2
Srivatsa Vaddagiri
vatsa at in.ibm.com
Thu Oct 6 04:01:42 EST 2005
Ben,
Here's the revised version, taking into account some of the
comments you had. Changes since last time:
- native_idle also converted over
- Fixed a bug in calculation of next_dec in stop_hz_timer
- Removed call to start_hz_timer from head.S
- Added a call to start_hz_timer in performance_monitor_exception
This has been tested against 2.6.14-rc1 on a 4-way Power4 box (p630)
with an additional patch (the same test patch I had sent earlier,
which shows decrementer statistics in /proc).
I will rebase this patch against latest -mm if you think this is in
the right direction.
Signed-off-by: Srivatsa Vaddagiri <vatsa at in.ibm.com>
---
linux-2.6.14-rc1-root/arch/ppc64/Kconfig | 6
linux-2.6.14-rc1-root/arch/ppc64/kernel/idle.c | 6
linux-2.6.14-rc1-root/arch/ppc64/kernel/irq.c | 3
linux-2.6.14-rc1-root/arch/ppc64/kernel/pSeries_setup.c | 10 -
linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c | 112 ++++++++++++++--
linux-2.6.14-rc1-root/arch/ppc64/kernel/traps.c | 2
linux-2.6.14-rc1-root/include/asm-ppc64/time.h | 8 +
linux-2.6.14-rc1-root/kernel/sysctl.c | 20 +-
8 files changed, 141 insertions(+), 26 deletions(-)
diff -puN arch/ppc64/kernel/time.c~ppc64 arch/ppc64/kernel/time.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/time.c~ppc64 2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/time.c 2005-10-05 16:34:51.000000000 +0530
@@ -315,23 +315,13 @@ static void iSeries_tb_recal(void)
unsigned long tb_last_stamp __cacheline_aligned_in_smp;
-/*
- * timer_interrupt - gets called when the decrementer overflows,
- * with interrupts disabled.
- */
-int timer_interrupt(struct pt_regs * regs)
+static void account_ticks(struct pt_regs *regs)
{
int next_dec;
unsigned long cur_tb;
struct paca_struct *lpaca = get_paca();
unsigned long cpu = smp_processor_id();
- irq_enter();
-
- profile_tick(CPU_PROFILING, regs);
-
- lpaca->lppaca.int_dword.fields.decr_int = 0;
-
while (lpaca->next_jiffy_update_tb <= (cur_tb = get_tb())) {
/*
* We cannot disable the decrementer, so in the period
@@ -364,6 +354,43 @@ int timer_interrupt(struct pt_regs * reg
if (next_dec > lpaca->default_decr)
next_dec = lpaca->default_decr;
set_dec(next_dec);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+/* Drop this CPU from nohz_cpu_mask; returns 1 if it was set there, else 0 */
+static inline int clear_hzless_mask(void)
+{
+	unsigned long this_cpu = smp_processor_id();
+
+	if (unlikely(cpu_isset(this_cpu, nohz_cpu_mask))) {
+		/* Bit was set: clear it and tell the caller so */
+		cpu_clear(this_cpu, nohz_cpu_mask);
+		return 1;
+	}
+
+	return 0;
+}
+#else
+static inline int clear_hzless_mask(void) { return 0; }
+#endif
+
+/*
+ * timer_interrupt - gets called when the decrementer overflows,
+ * with interrupts disabled.
+ */
+int timer_interrupt(struct pt_regs * regs)
+{
+ struct paca_struct *lpaca = get_paca();
+
+ irq_enter();
+
+ clear_hzless_mask();
+
+ profile_tick(CPU_PROFILING, regs);
+
+ lpaca->lppaca.int_dword.fields.decr_int = 0;
+
+ account_ticks(regs);
#ifdef CONFIG_PPC_ISERIES
if (hvlpevent_is_pending())
@@ -381,6 +408,69 @@ int timer_interrupt(struct pt_regs * reg
return 1;
}
+#ifdef CONFIG_NO_IDLE_HZ
+
+#define MAX_DEC_COUNT (INT_MAX) /* 32-bit decrementer, loaded from a signed int: keep the count positive */
+#define MIN_SKIP 2 /* fewest jiffies for which stop_hz_timer() will skip the tick */
+#define MAX_SKIP (MAX_DEC_COUNT/tb_ticks_per_jiffy) /* most jiffies one decrementer load can cover */
+
+int sysctl_hz_timer = 1;
+
+/* Avoid the HZ timer (decrementer) interrupt on this CPU for "some" time.
+ * Has to be called with interrupts disabled.
+ *
+ * The HZ timer frequency is restored upon the occurrence of an interrupt or
+ * exception on this CPU. Caller has to ensure that the CPU doesn't exit
+ * idle mode via other means.
+ */
+void stop_hz_timer(void)
+{
+ unsigned long cpu = smp_processor_id(), seq, delta;
+ int next_dec;
+
+ if (sysctl_hz_timer != 0) /* non-zero sysctl value: leave the regular tick running */
+ return;
+
+ cpu_set(cpu, nohz_cpu_mask); /* mark this CPU as having its HZ tick stopped */
+ mb(); /* NOTE(review): presumably orders the mask update before the checks below - confirm */
+ if (rcu_pending(cpu) || local_softirq_pending()) {
+ cpu_clear(cpu, nohz_cpu_mask); /* work is pending: undo the mark and keep ticking */
+ return;
+ }
+
+ do {
+ seq = read_seqbegin(&xtime_lock); /* retry the computation if xtime_lock was written meanwhile */
+
+ delta = next_timer_interrupt() - jiffies; /* distance in jiffies to next_timer_interrupt() */
+
+ if (delta < MIN_SKIP) { /* too close to be worth skipping: undo the mark */
+ cpu_clear(cpu, nohz_cpu_mask);
+ return;
+ }
+
+ if (delta > MAX_SKIP) /* cap the skip at MAX_SKIP jiffies */
+ delta = MAX_SKIP;
+
+ next_dec = tb_last_stamp + delta * tb_ticks_per_jiffy; /* NOTE(review): unsigned long result stored in an int */
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ next_dec -= get_tb(); /* turn the target into a count relative to the current timebase */
+ set_dec(next_dec); /* NOTE(review): next_dec is signed; confirm MAX_SKIP keeps it positive */
+
+ return;
+}
+
+/* If this CPU was marked in nohz_cpu_mask, unmark it and run account_ticks() to catch up on skipped ticks */
+void start_hz_timer(struct pt_regs *regs)
+{
+ if (clear_hzless_mask())
+ account_ticks(regs);
+}
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+
/*
* Scheduler clock - returns current time in nanosec units.
*
diff -puN arch/ppc64/kernel/irq.c~ppc64 arch/ppc64/kernel/irq.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/irq.c~ppc64 2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/irq.c 2005-10-05 16:33:06.000000000 +0530
@@ -55,6 +55,7 @@
#include <asm/iSeries/ItLpQueue.h>
#include <asm/machdep.h>
#include <asm/paca.h>
+#include <asm/time.h>
#ifdef CONFIG_SMP
extern void iSeries_smp_message_recv( struct pt_regs * );
@@ -313,6 +314,8 @@ void do_IRQ(struct pt_regs *regs)
irq_enter();
+ start_hz_timer(regs);
+
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 2KB free? */
{
diff -puN include/asm-ppc64/time.h~ppc64 include/asm-ppc64/time.h
--- linux-2.6.14-rc1/include/asm-ppc64/time.h~ppc64 2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/include/asm-ppc64/time.h 2005-10-05 16:33:06.000000000 +0530
@@ -102,6 +102,14 @@ static inline unsigned long tb_ticks_sin
return get_tb() - tstamp;
}
+#ifdef CONFIG_NO_IDLE_HZ
+extern void stop_hz_timer(void);
+extern void start_hz_timer(struct pt_regs *);
+#else
+static inline void stop_hz_timer(void) { }
+static inline void start_hz_timer(struct pt_regs *regs) { }
+#endif
+
#define mulhwu(x,y) \
({unsigned z; asm ("mulhwu %0,%1,%2" : "=r" (z) : "r" (x), "r" (y)); z;})
#define mulhdu(x,y) \
diff -puN arch/ppc64/Kconfig~ppc64 arch/ppc64/Kconfig
--- linux-2.6.14-rc1/arch/ppc64/Kconfig~ppc64 2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/Kconfig 2005-10-05 16:33:06.000000000 +0530
@@ -146,6 +146,12 @@ config PPC_SPLPAR
processors, that is, which share physical processors between
two or more partitions.
+config NO_IDLE_HZ
+ depends on EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC || PPC_MAPLE)
+ bool "No HZ timer ticks in idle"
+ help
+ Switches the HZ timer interrupts off when a CPU is idle.
+
config KEXEC
bool "kexec system call (EXPERIMENTAL)"
depends on PPC_MULTIPLATFORM && EXPERIMENTAL
diff -puN kernel/sysctl.c~ppc64 kernel/sysctl.c
--- linux-2.6.14-rc1/kernel/sysctl.c~ppc64 2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/kernel/sysctl.c 2005-10-05 16:33:06.000000000 +0530
@@ -544,6 +544,16 @@ static ctl_table kern_table[] = {
.extra1 = &minolduid,
.extra2 = &maxolduid,
},
+#ifdef CONFIG_NO_IDLE_HZ
+ {
+ .ctl_name = KERN_HZ_TIMER,
+ .procname = "hz_timer",
+ .data = &sysctl_hz_timer,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
#ifdef CONFIG_ARCH_S390
#ifdef CONFIG_MATHEMU
{
@@ -555,16 +565,6 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
-#ifdef CONFIG_NO_IDLE_HZ
- {
- .ctl_name = KERN_HZ_TIMER,
- .procname = "hz_timer",
- .data = &sysctl_hz_timer,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
-#endif
{
.ctl_name = KERN_S390_USER_DEBUG_LOGGING,
.procname = "userprocess_debug",
diff -puN arch/ppc64/kernel/pSeries_setup.c~ppc64 arch/ppc64/kernel/pSeries_setup.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/pSeries_setup.c~ppc64 2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/pSeries_setup.c 2005-10-05 16:33:06.000000000 +0530
@@ -475,9 +475,10 @@ static inline void dedicated_idle_sleep(
* a prod occurs. Returning from the cede enables external
* interrupts.
*/
- if (!need_resched())
+ if (!need_resched()) {
+ stop_hz_timer();
cede_processor();
- else
+ } else
local_irq_enable();
} else {
/*
@@ -570,9 +571,10 @@ static int pseries_shared_idle(void)
* Check need_resched() again with interrupts disabled
* to avoid a race.
*/
- if (!need_resched())
+ if (!need_resched()) {
+ stop_hz_timer();
cede_processor();
- else
+ } else
local_irq_enable();
HMT_medium();
diff -puN arch/ppc64/kernel/traps.c~ppc64 arch/ppc64/kernel/traps.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/traps.c~ppc64 2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/traps.c 2005-10-05 16:33:06.000000000 +0530
@@ -43,6 +43,7 @@
#include <asm/systemcfg.h>
#include <asm/machdep.h>
#include <asm/pmc.h>
+#include <asm/time.h>
#ifdef CONFIG_DEBUGGER
int (*__debugger)(struct pt_regs *regs);
@@ -470,6 +471,7 @@ extern perf_irq_t perf_irq;
void performance_monitor_exception(struct pt_regs *regs)
{
+ start_hz_timer(regs);
perf_irq(regs);
}
diff -puN arch/ppc64/kernel/idle.c~ppc64 arch/ppc64/kernel/idle.c
--- linux-2.6.14-rc1/arch/ppc64/kernel/idle.c~ppc64 2005-10-05 16:33:06.000000000 +0530
+++ linux-2.6.14-rc1-root/arch/ppc64/kernel/idle.c 2005-10-05 16:33:06.000000000 +0530
@@ -73,8 +73,12 @@ int native_idle(void)
while (1) {
ppc64_runlatch_off();
- if (!need_resched())
+ local_irq_disable();
+ if (!need_resched()) {
+ stop_hz_timer();
+ local_irq_enable();
power4_idle();
+ }
if (need_resched()) {
ppc64_runlatch_on();
_
--
Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017
More information about the Linuxppc64-dev
mailing list