[PATCH v3 7/8] perf: Define PMU_TXN_READ interface

Thu Jul 23 09:19:16 AEST 2015

Peter Zijlstra [peterz at infradead.org] wrote:
| I've not woken up yet, and not actually fully read the email, but can
| you stuff the entire above chunk inside the IPI?
| 
| I think you could then actually optimize __perf_event_read() as well,
| because all these events should be on the same context, so no point in
| calling update_*time*() for every event or so.
| 

Do you mean something like this (will move the rename to a separate
patch before posting):
--

>From e8eddb5d3877ebdb3b71213a00aaa980f4010dd0 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev at linux.vnet.ibm.com>
Date: Tue, 7 Jul 2015 21:45:23 -0400
Subject: [PATCH 1/1] perf: Define PMU_TXN_READ interface

Define a new PERF_PMU_TXN_READ interface to read a group of counters
at once. Note that we use this interface with all PMUs.

PMUs that implement this interface use the ->read() operation to _queue_
the counters to be read and use ->commit_txn() to actually read all the
queued counters at once.

PMUs that don't implement PERF_PMU_TXN_READ ignore ->start_txn() and
->commit_txn() and continue to read counters one at a time.

Thanks to input from Peter Zijlstra.

Signed-off-by: Sukadev Bhattiprolu <sukadev at linux.vnet.ibm.com>
---

Changelog[v5]
	[Peter Zijlstra] Ensure the entire transaction happens on the same CPU.

Changelog[v4]
	[Peter Zijlstra] Add lockdep_assert_held() in perf_event_read_group()
---
 include/linux/perf_event.h |    1 +
 kernel/events/core.c       |   72 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 44bf05f..da307ad 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -169,6 +169,7 @@ struct perf_event;
 #define PERF_EVENT_TXN 0x1
 
 #define PERF_PMU_TXN_ADD  0x1		/* txn to add/schedule event on PMU */
+#define PERF_PMU_TXN_READ 0x2		/* txn to read event group from PMU */
 
 /**
  * pmu::capabilities flags
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a6bd09d..7177dd8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3174,12 +3174,8 @@ void perf_event_exec(void)
 	rcu_read_unlock();
 }
 
-/*
- * Cross CPU call to read the hardware event
- */
-static void __perf_event_read(void *info)
+static void __perf_event_read(struct perf_event *event, int update_ctx)
 {
-	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
@@ -3194,7 +3190,7 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	if (ctx->is_active) {
+	if (ctx->is_active && update_ctx) {
 		update_context_time(ctx);
 		update_cgrp_time_from_event(event);
 	}
@@ -3204,6 +3200,16 @@ static void __perf_event_read(void *info)
 	raw_spin_unlock(&ctx->lock);
 }
 
+/*
+ * Cross CPU call to read the hardware event
+ */
+static void __perf_event_read_ipi(void *info)
+{
+	struct perf_event *event = info;
+
+	__perf_event_read(event, 1);
+}
+
 static inline u64 perf_event_count(struct perf_event *event)
 {
 	if (event->pmu->count)
@@ -3220,7 +3226,7 @@ static void perf_event_read(struct perf_event *event)
 	 */
 	if (event->state == PERF_EVENT_STATE_ACTIVE) {
 		smp_call_function_single(event->oncpu,
-					 __perf_event_read, event, 1);
+					 __perf_event_read_ipi, event, 1);
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
 		unsigned long flags;
@@ -3765,6 +3771,36 @@ static void orphans_remove_work(struct work_struct *work)
 	put_ctx(ctx);
 }
 
+/*
+ * Use the transaction interface to read the group of events in @leader.
+ * PMUs like the 24x7 counters in Power, can use this to queue the events
+ * in the ->read() operation and perform the actual read in ->commit_txn.
+ *
+ * Other PMUs can ignore the ->start_txn and ->commit_txn and read each
+ * PMU directly in the ->read() operation.
+ */
+static int perf_event_read_group(struct perf_event *leader)
+{
+	int ret;
+	struct perf_event *sub;
+	struct pmu *pmu;
+	struct perf_event_context *ctx = leader->ctx;
+
+	lockdep_assert_held(&ctx->mutex);
+
+	pmu = leader->pmu;
+
+	pmu->start_txn(pmu, PERF_PMU_TXN_READ);
+
+	__perf_event_read(leader, 1);
+	list_for_each_entry(sub, &leader->sibling_list, group_entry)
+		__perf_event_read(sub, 0);
+
+	ret = pmu->commit_txn(pmu);
+
+	return ret;
+}
+
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
 	u64 total = 0;
@@ -3794,7 +3830,17 @@ static int perf_read_group(struct perf_event *event,
 
 	lockdep_assert_held(&ctx->mutex);
 
-	count = perf_event_read_value(leader, &enabled, &running);
+	mutex_lock(&leader->child_mutex);
+
+	ret = perf_event_read_group(leader);
+	if (ret) {
+		mutex_unlock(&leader->child_mutex);
+		return ret;
+	}
+
+	count = perf_event_aggregate(leader, &enabled, &running);
+
+	mutex_unlock(&leader->child_mutex);
 
 	values[n++] = 1 + leader->nr_siblings;
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -3815,15 +3861,19 @@ static int perf_read_group(struct perf_event *event,
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		n = 0;
 
-		values[n++] = perf_event_read_value(sub, &enabled, &running);
+		mutex_lock(&sub->child_mutex);
+
+		values[n++] = perf_event_aggregate(sub, &enabled, &running);
+
+		mutex_unlock(&sub->child_mutex);
+
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
 
 		size = n * sizeof(u64);
 
-		if (copy_to_user(buf + ret, values, size)) {
+		if (copy_to_user(buf + ret, values, size))
 			return -EFAULT;
-		}
 
 		ret += size;
 	}
-- 
1.7.9.5