[PATCH 2/2] cpufreq: powernv: Ramp-down global pstate slower than local-pstate

Akshay Adiga akshay.adiga at linux.vnet.ibm.com
Wed Apr 13 04:06:26 AEST 2016


This patch brings down the global pstate at a slower rate than the local
pstate. The frequency transition latency from pmin to pmax is observed to
be of the order of a few milliseconds, so sudden frequency ramp-ups incur
a performance penalty. Holding the global pstate higher than the local
pstate makes the subsequent ramp-ups faster.

A global per-policy structure is maintained to keep track of the global
and local pstate changes. The global pstate is brought down using a
quadratic equation. The ramp-down time to pmin is set to about 5 seconds
(MAX_RAMP_DOWN_TIME = 5120 ms). To make sure that the global pstates are
dropped at regular intervals, a timer is queued every 2 seconds, which
eventually brings the pstate down to the local pstate.

Iozone results show fairly consistent performance boost.
YCSB on redis shows improved Max latencies in most cases.

Iozone write/rewrite tests were made with file sizes 200704 kB and 401408 kB
with different record sizes. The following table shows IO operations/sec
with and without the patch.

Iozone Results ( in op/sec) ( mean over 3 iterations )
------------------------------------
file size-	   		with     	without
recordsize-IOtype      		patch    	patch 		 % change
----------------------------------------------------------------------
200704-1-SeqWrite		1616532 	1615425 	0.06
200704-1-Rewrite		2423195  	2303130 	5.21
200704-2-SeqWrite		1628577 	1602620 	1.61
200704-2-Rewrite		2428264  	2312154 	5.02
200704-4-SeqWrite		1617605 	1617182 	0.02
200704-4-Rewrite		2430524  	2351238 	3.37
200704-8-SeqWrite		1629478 	1600436 	1.81
200704-8-Rewrite		2415308  	2298136 	5.09
200704-16-SeqWrite		1619632 	1618250 	0.08
200704-16-Rewrite		2396650 	2352591 	1.87
200704-32-SeqWrite		1632544 	1598083 	2.15
200704-32-Rewrite		2425119 	2329743 	4.09
200704-64-SeqWrite		1617812 	1617235 	0.03
200704-64-Rewrite		2402021 	2321080 	3.48
200704-128-SeqWrite		1631998 	1600256 	1.98
200704-128-Rewrite		2422389		2304954 	5.09
200704-256 SeqWrite		1617065		1616962 	0.00
200704-256-Rewrite		2432539 	2301980 	5.67
200704-512-SeqWrite		1632599 	1598656 	2.12
200704-512-Rewrite		2429270		2323676 	4.54
200704-1024-SeqWrite		1618758		1616156 	0.16
200704-1024-Rewrite		2431631		2315889 	4.99
401408-1-SeqWrite		1631479  	1608132 	1.45
401408-1-Rewrite		2501550  	2459409 	1.71
401408-2-SeqWrite		1617095 	1626069 	-0.55
401408-2-Rewrite		2507557  	2443621 	2.61
401408-4-SeqWrite		1629601 	1611869		1.10
401408-4-Rewrite		2505909  	2462098 	1.77
401408-8-SeqWrite  		1617110 	1626968 	-0.60
401408-8-Rewrite		2512244  	2456827 	2.25
401408-16-SeqWrite		1632609 	1609603 	1.42
401408-16-Rewrite		2500792 	2451405 	2.01
401408-32-SeqWrite		1619294 	1628167 	-0.54
401408-32-Rewrite		2510115		2451292 	2.39
401408-64-SeqWrite		1632709		1603746 	1.80
401408-64-Rewrite		2506692 	2433186 	3.02
401408-128-SeqWrite		1619284		1627461 	-0.50
401408-128-Rewrite		2518698 	2453361 	2.66
401408-256-SeqWrite		1634022 	1610681 	1.44
401408-256-Rewrite		2509987 	2446328 	2.60
401408-512-SeqWrite 		1617524 	1628016 	-0.64
401408-512-Rewrite		2504409 	2442899 	2.51
401408-1024-SeqWrite		1629812 	1611566 	1.13
401408-1024-Rewrite 		2507620		 2442968 	2.64

Tested with the YCSB workloada over redis for 1 million records and 1 million
operations. Each test was carried out with a target operations-per-second
rate and persistence disabled.

Max-latency (in us)( mean over 5 iterations )
-----------------------------------------------------------
op/s	Operation	with patch	without patch	%change
------------------------------------------------------------
15000	Read		61480.6		50261.4		22.32
15000	cleanup		215.2		293.6		-26.70
15000	update		25666.2		25163.8		2.00

25000	Read		32626.2		89525.4		-63.56
25000	cleanup		292.2		263.0		11.10
25000	update		32293.4		90255.0		-64.22

35000	Read		34783.0		33119.0		5.02
35000	cleanup		321.2		395.8		-18.8
35000	update		36047.0		38747.8		-6.97

40000	Read		38562.2		42357.4		-8.96
40000	cleanup		371.8		384.6		-3.33
40000	update		27861.4		41547.8		-32.94

45000	Read		42271.0		88120.6		-52.03
45000	cleanup		263.6		383.0		-31.17
45000	update		29755.8		81359.0		-63.43

(test without target op/s)
47659	Read		83061.4		136440.6	-39.12
47659	cleanup		195.8		193.8		1.03
47659	update		73429.4		124971.8	-41.24

Signed-off-by: Akshay Adiga <akshay.adiga at linux.vnet.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego at linux.vnet.ibm.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 239 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 237 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index e2e2219..288fa10 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -36,12 +36,58 @@
 #include <asm/reg.h>
 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
 #include <asm/opal.h>
+#include <linux/timer.h>
 
 #define POWERNV_MAX_PSTATES	256
 #define PMSR_PSAFE_ENABLE	(1UL << 30)
 #define PMSR_SPR_EM_DISABLE	(1UL << 31)
 #define PMSR_MAX(x)		((x >> 32) & 0xFF)
 
+/*
+ * Quadratic equation giving the percentage ramp-down for the time elapsed
+ * (in milliseconds) since ramp-down began; valid for 0..MAX_RAMP_DOWN_TIME.
+ * Approximates y = 4e-6 * x^2 (implemented as x^2 / 2^18), so that
+ *
+ * at x = 0    ms: ramp_down_percent = 0
+ * at x = 5120 ms (MAX_RAMP_DOWN_TIME): ramp_down_percent = 100
+ */
+#define MAX_RAMP_DOWN_TIME				5120
+#define ramp_down_percent(time)		(((time) * (time)) >> 18)
+
+/* Interval (ms) after which the timer is queued to bring down global pstate */
+#define GPSTATE_TIMER_INTERVAL				2000
+/*
+ * global_pstate_info:
+ *	per-policy data structure to maintain history of global pstates
+ *
+ * @highest_lpstate:	the local pstate from which we are ramping down
+ * @elapsed_time:	time in ms spent in ramping down from highest_lpstate
+ * @last_sampled_time:	time from boot in ms when global pstates were last set
+ * @last_lpstate, @last_gpstate:	last set values for local and global pstates
+ * @timer:	used for ramping down if cpu goes idle for a long time with
+ *	global pstate held high
+ * @gpstate_lock:	a spinlock to maintain synchronization between routines
+ *	called by the timer handler and governor's target_index calls
+ */
+struct global_pstate_info {
+	int highest_lpstate;
+	unsigned int elapsed_time;
+	unsigned int last_sampled_time;
+	int last_lpstate;
+	int last_gpstate;
+	spinlock_t gpstate_lock;
+	struct timer_list timer;
+};
+
+/*
+ * While resetting we don't want the "timer" and "gpstate_lock" fields to be
+ * zeroed, or we would lose track of a pending timer; they are the last two
+ * members, so clear only the history fields that precede them.
+ */
+#define reset_gpstates(policy)   memset((policy)->driver_data, 0,\
+					sizeof(struct global_pstate_info)-\
+					sizeof(struct timer_list)-\
+					sizeof(spinlock_t))
+
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
 static bool rebooting, throttled, occ_reset;
 
@@ -285,6 +331,7 @@ static inline void set_pmspr(unsigned long sprn, unsigned long val)
 struct powernv_smp_call_data {
 	unsigned int freq;
 	int pstate_id;
+	int gpstate_id;	/* global pstate, written to PMCR bits 56..63 */
 };
 
 /*
@@ -348,14 +395,17 @@ static void set_pstate(void *freq_data)
 	unsigned long val;
 	unsigned long pstate_ul =
 		((struct powernv_smp_call_data *) freq_data)->pstate_id;
+	unsigned long gpstate_ul =
+		((struct powernv_smp_call_data *) freq_data)->gpstate_id;
 
 	val = get_pmspr(SPRN_PMCR);
 	val = val & 0x0000FFFFFFFFFFFFULL;
 
 	pstate_ul = pstate_ul & 0xFF;
+	gpstate_ul = gpstate_ul & 0xFF;
 
 	/* Set both global(bits 56..63) and local(bits 48..55) PStates */
-	val = val | (pstate_ul << 56) | (pstate_ul << 48);
+	val = val | (gpstate_ul << 56) | (pstate_ul << 48);
 
 	pr_debug("Setting cpu %d pmcr to %016lX\n",
 			raw_smp_processor_id(), val);
@@ -425,6 +475,109 @@ next:
 }
 
 /*
+ * calculate_global_pstate:
+ *
+ * @elapsed_time: elapsed time in milliseconds since ramp-down began
+ * @highest_lpstate: pstate from which we are ramping down
+ * @local_pstate: new local pstate
+ *
+ * Finds the appropriate global pstate based on the pstate from which we are
+ * ramping down and the time elapsed in ramping down. It follows a quadratic
+ * equation which ensures the global pstate reaches pmin in ~5 seconds.
+ */
+static inline int calculate_global_pstate(unsigned int elapsed_time,
+		int highest_lpstate, int local_pstate)
+{
+	int pstate_diff;
+
+	/*
+	 * ramp_down_percent gives the expected percentage of ramp-down for
+	 * the elapsed time. The difference between highest_lpstate and
+	 * powernv_pstate_info.min is the absolute number of pstates we will
+	 * eventually have dropped by the end of ~5 seconds; scale it by the
+	 * percentage to get the number of pstates to drop now.
+	 */
+	pstate_diff =  ((int)ramp_down_percent(elapsed_time) *
+			(highest_lpstate - powernv_pstate_info.min))/100;
+
+	/* Ensure that global pstate is >= local pstate */
+	if (highest_lpstate - pstate_diff < local_pstate)
+		return local_pstate;
+	else
+		return (highest_lpstate - pstate_diff);
+}
+
+static inline int queue_gpstate_timer(struct global_pstate_info *gpstates)
+{
+	unsigned int timer_interval;
+
+	/*
+	 * The timer normally fires after GPSTATE_TIMER_INTERVAL ms, but if
+	 * that would overshoot MAX_RAMP_DOWN_TIME ms of total ramp-down
+	 * time, set it to fire exactly when MAX_RAMP_DOWN_TIME ms of
+	 * ramp-down time have elapsed.
+	 */
+	if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
+							 > MAX_RAMP_DOWN_TIME)
+		timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
+	else
+		timer_interval = GPSTATE_TIMER_INTERVAL;
+
+	return  mod_timer_pinned(&(gpstates->timer), jiffies +
+				msecs_to_jiffies(timer_interval));
+}
+/*
+ * gpstate_timer_handler
+ *
+ * @data: pointer to the cpufreq_policy on which the timer was queued
+ *
+ * This handler brings the global pstate closer to the local pstate
+ * according to the quadratic equation. It requeues itself if the two
+ * pstates are still not equal.
+ */
+void gpstate_timer_handler(unsigned long data)
+{
+	struct cpufreq_policy *policy = (struct cpufreq_policy *) data;
+	struct global_pstate_info *gpstates = (struct global_pstate_info *)
+							policy->driver_data;
+	unsigned int time_diff = jiffies_to_msecs(jiffies)
+					- gpstates->last_sampled_time;
+	struct powernv_smp_call_data freq_data;
+	int ret;
+
+	ret = spin_trylock(&gpstates->gpstate_lock);
+	if (!ret)
+		return;	/* contended: target_index is presumably updating */
+
+	gpstates->last_sampled_time += time_diff;
+	gpstates->elapsed_time += time_diff;
+	freq_data.pstate_id = gpstates->last_lpstate;
+	if ((gpstates->last_gpstate == freq_data.pstate_id) ||
+			(gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+		freq_data.gpstate_id = freq_data.pstate_id;
+		reset_gpstates(policy);
+		gpstates->highest_lpstate = freq_data.pstate_id;
+	} else {
+		freq_data.gpstate_id = calculate_global_pstate(
+			gpstates->elapsed_time, gpstates->highest_lpstate,
+			freq_data.pstate_id);
+	}
+
+	/*
+	 * Ramp-down is over once gpstate == lpstate; no need to requeue.
+	 */
+	if (freq_data.gpstate_id != freq_data.pstate_id)
+		ret = queue_gpstate_timer(gpstates);
+
+	gpstates->last_gpstate = freq_data.gpstate_id;
+	gpstates->last_lpstate = freq_data.pstate_id;
+
+	/* Timer may get migrated to a different cpu on cpu hot unplug */
+	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
+	spin_unlock(&gpstates->gpstate_lock);
+}
+
+
+/*
  * powernv_cpufreq_target_index: Sets the frequency corresponding to
  * the cpufreq table entry indexed by new_index on the cpus in the
  * mask policy->cpus
@@ -432,23 +585,88 @@ next:
 static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
 					unsigned int new_index)
 {
+	/* freq_data carries the local/global pstate pair for set_pstate() */
 	struct powernv_smp_call_data freq_data;
-
+	unsigned int cur_msec;
+	unsigned long flags;
+	struct global_pstate_info *gpstates = (struct global_pstate_info *)
+						policy->driver_data;
 	if (unlikely(rebooting) && new_index != get_nominal_index())
 		return 0;
 
 	if (!throttled)
 		powernv_cpufreq_throttle_check(NULL);
 
+	cur_msec = jiffies_to_msecs(get_jiffies_64());
+
+	/* Serialize pstate history updates against gpstate_timer_handler() */
+	spin_lock_irqsave(&gpstates->gpstate_lock, flags);
 	freq_data.pstate_id = powernv_freqs[new_index].driver_data;
 
+	/* First call on this policy: no ramp-down history to consult yet */
+	if (!gpstates->last_sampled_time) {
+		freq_data.gpstate_id = freq_data.pstate_id;
+		gpstates->highest_lpstate = freq_data.pstate_id;
+		goto gpstates_done;
+	}
+
+	/* Ramp down: requested local pstate is below the last global one */
+	if (gpstates->last_gpstate > freq_data.pstate_id) {
+		gpstates->elapsed_time += cur_msec -
+					gpstates->last_sampled_time;
+		/* If it has been ramping down for more than
+		 * MAX_RAMP_DOWN_TIME ms, reset all global pstate data and
+		 * set it equal to the local pstate to start fresh.
+		 */
+		if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
+			/* ramp-down window exhausted; start fresh */
+			reset_gpstates(policy);
+			gpstates->highest_lpstate = freq_data.pstate_id;
+			freq_data.gpstate_id = freq_data.pstate_id;
+		} else {
+		/* elapsed_time within budget, continue to ramp down */
+			freq_data.gpstate_id = calculate_global_pstate(
+			gpstates->elapsed_time,
+			gpstates->highest_lpstate, freq_data.pstate_id);
+
+		}
+
+	} else {
+		/* Ramp up: no global/local gap to maintain */
+		reset_gpstates(policy);
+		gpstates->highest_lpstate = freq_data.pstate_id;
+		freq_data.gpstate_id = freq_data.pstate_id;
+	}
+
+	/*
+	 * Ramp-down is over once gpstate == lpstate; no timer needed.
+	 */
+	if (freq_data.gpstate_id != freq_data.pstate_id)
+		queue_gpstate_timer(gpstates);
+gpstates_done:
+	gpstates->last_sampled_time = cur_msec;
+	gpstates->last_gpstate = freq_data.gpstate_id;
+	gpstates->last_lpstate = freq_data.pstate_id;
+
 	/*
 	 * Use smp_call_function to send IPI and execute the
 	 * mtspr on target CPU.  We could do that without IPI
 	 * if current CPU is within policy->cpus (core)
 	 */
 	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
+	spin_unlock_irqrestore(&gpstates->gpstate_lock, flags);
+	return 0;
+}
 
+/* Tear down per-policy gpstate tracking: stop the timer, free the state. */
+static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+	struct global_pstate_info *gpstates = policy->driver_data;
+
+	/* Ensure a concurrently-running timer handler has finished */
+	del_timer_sync(&gpstates->timer);
+	kfree(gpstates);
+	policy->driver_data = NULL;	/* avoid dangling pointer on re-init */
+	return 0;
+}
 
@@ -456,6 +674,7 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
 	int base, i;
 	struct kernfs_node *kn;
+	struct global_pstate_info *gpstates;
 
 	base = cpu_first_thread_sibling(policy->cpu);
 
@@ -475,6 +694,21 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	} else {
 		kernfs_put(kn);
 	}
+	gpstates =  kzalloc(sizeof(struct global_pstate_info), GFP_KERNEL);
+	if (!gpstates) {
+		pr_err("Could not allocate global_pstate_info\n");
+		return -ENOMEM;
+	}
+	policy->driver_data = gpstates;
+
+	/* initialize timer */
+	init_timer_deferrable(&gpstates->timer);
+	gpstates->timer.data = (unsigned long) policy;
+	gpstates->timer.function = gpstate_timer_handler;
+	gpstates->timer.expires = jiffies +
+				msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
+
+	pr_info("Added global_pstate_info & timer for %d cpu\n", base);
 	return cpufreq_table_validate_and_show(policy, powernv_freqs);
 }
 
@@ -612,6 +846,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
 	.name		= "powernv-cpufreq",
 	.flags		= CPUFREQ_CONST_LOOPS,
 	.init		= powernv_cpufreq_cpu_init,
+	.exit		= powernv_cpufreq_cpu_exit,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= powernv_cpufreq_target_index,
 	.get		= powernv_cpufreq_get,
-- 
2.5.5



More information about the Linuxppc-dev mailing list