[PATCH 02/14] powerpc/vpa_dtl: Add interface to expose vpa dtl counters via perf
Shrikanth Hegde
sshegde at linux.ibm.com
Wed Aug 20 21:53:09 AEST 2025
On 8/15/25 14:03, Athira Rajeev wrote:
> From: Kajol Jain <kjain at linux.ibm.com>
>
> The pseries Shared Processor Logical Partition (SPLPAR) machines
> can retrieve a log of dispatch and preempt events from the
> hypervisor using data from the Dispatch Trace Log (DTL) buffer.
> With this information, user can retrieve when and why each dispatch &
> preempt has occurred. Added an interface to expose the Virtual Processor
> Area(VPA) DTL counters via perf.
>
> The following events are available and exposed in sysfs:
>
> vpa_dtl/dtl_cede/ - Trace voluntary (OS initiated) virtual processor waits
> vpa_dtl/dtl_preempt/ - Trace time slice preempts
> vpa_dtl/dtl_fault/ - Trace virtual partition memory page faults.
> vpa_dtl/dtl_all/ - Trace all (dtl_cede/dtl_preempt/dtl_fault)
>
> Added interface defines supported event list, config fields for the
> event attributes and their corresponding bit values which are exported
> via sysfs. User could use the standard perf tool to access perf events
> exposed via vpa-dtl pmu.
>
> The VPA DTL PMU counters do not interrupt on overflow or generate any
> PMI interrupts. Therefore, the kernel needs to poll the counters, added
> hrtimer code to do that. The timer interval can be provided by the user via
> the sample_period field in nanoseconds. There is one hrtimer added per
> vpa-dtl pmu thread.
>
> To ensure there are no other conflicting dtl users (example: debugfs dtl
> or /proc/powerpc/vcpudispatch_stats), interface added code to use
> "down_write_trylock" call to take the dtl_access_lock. The dtl_access_lock
> is defined in dtl.h file. Also added global reference count variable called
> "dtl_global_refc", to ensure dtl data can be captured per-cpu. Code also
> added global lock called "dtl_global_lock" to avoid race condition.
>
> Signed-off-by: Kajol Jain <kjain at linux.ibm.com>
> ---
> arch/powerpc/perf/Makefile | 2 +-
> arch/powerpc/perf/vpa-dtl.c | 349 ++++++++++++++++++++++++++++++++++++
> 2 files changed, 350 insertions(+), 1 deletion(-)
> create mode 100644 arch/powerpc/perf/vpa-dtl.c
>
> diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
> index 7f53fcb7495a..78dd7e25219e 100644
> --- a/arch/powerpc/perf/Makefile
> +++ b/arch/powerpc/perf/Makefile
> @@ -14,7 +14,7 @@ obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o
> obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o
> obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o
>
> -obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o
> +obj-$(CONFIG_HV_PERF_CTRS) += hv-24x7.o hv-gpci.o hv-common.o vpa-dtl.o
>
> obj-$(CONFIG_VPA_PMU) += vpa-pmu.o
>
> diff --git a/arch/powerpc/perf/vpa-dtl.c b/arch/powerpc/perf/vpa-dtl.c
> new file mode 100644
> index 000000000000..e92756f88801
> --- /dev/null
> +++ b/arch/powerpc/perf/vpa-dtl.c
> @@ -0,0 +1,349 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Perf interface to expose Dispatch Trace Log counters.
> + *
> + * Copyright (C) 2024 Kajol Jain, IBM Corporation
> + */
> +
> +#ifdef CONFIG_PPC_SPLPAR
> +#define pr_fmt(fmt) "vpa_dtl: " fmt
> +
> +#include <asm/dtl.h>
> +#include <linux/perf_event.h>
> +#include <asm/plpar_wrappers.h>
> +
> +#define EVENT(_name, _code) enum{_name = _code}
> +
> +/*
> + * Based on Power Architecture Platform Reference(PAPR) documentation,
> + * Table 14.14. Per Virtual Processor Area, below Dispatch Trace Log(DTL)
> + * Enable Mask used to get corresponding virtual processor dispatch
> + * to preempt traces:
> + * DTL_CEDE(0x1): Trace voluntary (OS initiated) virtual
> + * processor waits
> + * DTL_PREEMPT(0x2): Trace time slice preempts
> + * DTL_FAULT(0x4): Trace virtual partition memory page
> + faults.
> + * DTL_ALL(0x7): Trace all (DTL_CEDE | DTL_PREEMPT | DTL_FAULT)
> + *
> + * Event codes based on Dispatch Trace Log Enable Mask.
> + */
> +EVENT(DTL_CEDE, 0x1);
> +EVENT(DTL_PREEMPT, 0x2);
> +EVENT(DTL_FAULT, 0x4);
> +EVENT(DTL_ALL, 0x7);
> +
> +GENERIC_EVENT_ATTR(dtl_cede, DTL_CEDE);
> +GENERIC_EVENT_ATTR(dtl_preempt, DTL_PREEMPT);
> +GENERIC_EVENT_ATTR(dtl_fault, DTL_FAULT);
> +GENERIC_EVENT_ATTR(dtl_all, DTL_ALL);
> +
> +PMU_FORMAT_ATTR(event, "config:0-7");
> +
> +static struct attribute *events_attr[] = {
> + GENERIC_EVENT_PTR(DTL_CEDE),
> + GENERIC_EVENT_PTR(DTL_PREEMPT),
> + GENERIC_EVENT_PTR(DTL_FAULT),
> + GENERIC_EVENT_PTR(DTL_ALL),
> + NULL
> +};
> +
> +static struct attribute_group event_group = {
> + .name = "events",
> + .attrs = events_attr,
> +};
> +
> +static struct attribute *format_attrs[] = {
> + &format_attr_event.attr,
> + NULL,
> +};
> +
> +static const struct attribute_group format_group = {
> + .name = "format",
> + .attrs = format_attrs,
> +};
> +
> +static const struct attribute_group *attr_groups[] = {
> + &format_group,
> + &event_group,
> + NULL,
> +};
> +
> +struct vpa_dtl {
> + struct dtl_entry *buf;
> + u64 last_idx;
> + bool active_lock;
How is this active_lock being used?
I see it is set/unset, but couldn't figure out how it is used.
> +};
> +
> +static DEFINE_PER_CPU(struct vpa_dtl, vpa_dtl_cpu);
> +
> +/* variable to capture reference count for the active dtl threads */
> +static int dtl_global_refc;
> +static spinlock_t dtl_global_lock = __SPIN_LOCK_UNLOCKED(dtl_global_lock);
> +
> +/*
> + * Function to dump the dispatch trace log buffer data to the
> + * perf data.
> + */
> +static void vpa_dtl_dump_sample_data(struct perf_event *event)
> +{
> + return;
> +}
> +
> +/*
> + * The VPA Dispatch Trace log counters do not interrupt on overflow.
> + * Therefore, the kernel needs to poll the counters to avoid missing
> + * an overflow using hrtimer. The timer interval is based on sample_period
> + * count provided by user, and minimum interval is 1 millisecond.
> + */
> +static enum hrtimer_restart vpa_dtl_hrtimer_handle(struct hrtimer *hrtimer)
> +{
> + struct perf_event *event;
> + u64 period;
> +
> + event = container_of(hrtimer, struct perf_event, hw.hrtimer);
> +
> + if (event->state != PERF_EVENT_STATE_ACTIVE)
> + return HRTIMER_NORESTART;
> +
> + vpa_dtl_dump_sample_data(event);
> + period = max_t(u64, NSEC_PER_MSEC, event->hw.sample_period);
> + hrtimer_forward_now(hrtimer, ns_to_ktime(period));
> +
> + return HRTIMER_RESTART;
> +}
> +
> +static void vpa_dtl_start_hrtimer(struct perf_event *event)
> +{
> + u64 period;
> + struct hw_perf_event *hwc = &event->hw;
> +
> + period = max_t(u64, NSEC_PER_MSEC, hwc->sample_period);
> + hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static void vpa_dtl_stop_hrtimer(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> +
> + hrtimer_cancel(&hwc->hrtimer);
> +}
> +
> +static void vpa_dtl_reset_global_refc(struct perf_event *event)
> +{
> + spin_lock(&dtl_global_lock);
> + dtl_global_refc--;
> + if (dtl_global_refc <= 0) {
> + dtl_global_refc = 0;
> + up_write(&dtl_access_lock);
> + }
> + spin_unlock(&dtl_global_lock);
> +}
> +
> +/* Allocate dtl buffer memory for given cpu. */
The above comment is self-explanatory, so it may not be needed.
> +static int vpa_dtl_mem_alloc(int cpu)
> +{
> + struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, cpu);
> + struct dtl_entry *buf = NULL;
> +
> + /* Check for dispatch trace log buffer cache */
> + if (!dtl_cache)
> + return -ENOMEM;
> +
> + buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(cpu));
You probably need GFP_ATOMIC here, since this is called when spinlocks are held.
> + if (!buf) {
> + pr_warn("buffer allocation failed for cpu %d\n", cpu);
> + return -ENOMEM;
> + }
> + dtl->buf = buf;
> + return 0;
> +}
> +
> +static int vpa_dtl_event_init(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
> +
> + /* test the event attr type for PMU enumeration */
> + if (event->attr.type != event->pmu->type)
> + return -ENOENT;
> +
> + if (!perfmon_capable())
> + return -EACCES;
> +
> + /* Return if this is a counting event */
> + if (!is_sampling_event(event))
> + return -EOPNOTSUPP;
> +
> + /* no branch sampling */
> + if (has_branch_stack(event))
> + return -EOPNOTSUPP;
> +
> + /* Invalid eventcode */
> + switch (event->attr.config) {
> + case DTL_LOG_CEDE:
> + case DTL_LOG_PREEMPT:
> + case DTL_LOG_FAULT:
> + case DTL_LOG_ALL:
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + spin_lock(&dtl_global_lock);
> +
> + /*
> + * To ensure there are no other conflicting dtl users
> + * (example: /proc/powerpc/vcpudispatch_stats or debugfs dtl),
> + * below code try to take the dtl_access_lock.
> + * The dtl_access_lock is a rwlock defined in dtl.h, which is used
> + * to ensure there are no conflicting dtl users.
> + * Based on below code, vpa_dtl pmu tries to take write access lock
> + * and also checks for dtl_global_refc, to make sure that the
> + * dtl_access_lock is taken by vpa_dtl pmu interface.
> + */
> + if (dtl_global_refc == 0 && !down_write_trylock(&dtl_access_lock)) {
> + spin_unlock(&dtl_global_lock);
> + return -EBUSY;
> + }
> +
> + /* Allocate dtl buffer memory */
> + if (vpa_dtl_mem_alloc(event->cpu)) {
> + spin_unlock(&dtl_global_lock);
> + return -ENOMEM;
> + }
> +
> + /*
> + * Increment the number of active vpa_dtl pmu threads. The
> + * dtl_global_refc is used to keep count of cpu threads that
> + * currently capturing dtl data using vpa_dtl pmu interface.
> + */
> + dtl_global_refc++;
> +
> + /*
> + * active_lock is a per cpu variable which is set if
> + * current cpu is running vpa_dtl perf record session.
> + */
> + dtl->active_lock = true;
> + spin_unlock(&dtl_global_lock);
> +
> + hrtimer_setup(&hwc->hrtimer, vpa_dtl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +
> + /*
> + * Since hrtimers have a fixed rate, we can do a static freq->period
> + * mapping and avoid the whole period adjust feedback stuff.
> + */
I didn't get this comment. What is meant by "hrtimers have a fixed rate"? You can always
adjust the period value for the next expiry.
> + if (event->attr.freq) {
> + long freq = event->attr.sample_freq;
> +
> + event->attr.sample_period = NSEC_PER_SEC / freq;
> + hwc->sample_period = event->attr.sample_period;
> + local64_set(&hwc->period_left, hwc->sample_period);
> + hwc->last_period = hwc->sample_period;
> + event->attr.freq = 0;
> + }
I am not very familiar with PMU stuff.
What does the above do? what is period_left?
> +
> + event->destroy = vpa_dtl_reset_global_refc;
> + return 0;
> +}
> +
> +static int vpa_dtl_event_add(struct perf_event *event, int flags)
> +{
> + int ret, hwcpu;
> + unsigned long addr;
> + struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
> +
> + /*
> + * Register our dtl buffer with the hypervisor. The
> + * HV expects the buffer size to be passed in the second
> + * word of the buffer. Refer section '14.11.3.2. H_REGISTER_VPA'
> + * from PAPR for more information.
> + */
> + ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
> + dtl->last_idx = 0;
> +
> + hwcpu = get_hard_smp_processor_id(event->cpu);
> + addr = __pa(dtl->buf);
> +
> + ret = register_dtl(hwcpu, addr);
> + if (ret) {
> + pr_warn("DTL registration for cpu %d (hw %d) failed with %d\n",
> + event->cpu, hwcpu, ret);
> + return ret;
> + }
> +
> + /* set our initial buffer indices */
> + lppaca_of(event->cpu).dtl_idx = 0;
> +
> + /*
> + * Ensure that our updates to the lppaca fields have
> + * occurred before we actually enable the logging
> + */
> + smp_wmb();
> +
> + /* enable event logging */
> + lppaca_of(event->cpu).dtl_enable_mask = event->attr.config;
> +
> + vpa_dtl_start_hrtimer(event);
> +
> + return 0;
> +}
> +
> +static void vpa_dtl_event_del(struct perf_event *event, int flags)
> +{
> + int hwcpu = get_hard_smp_processor_id(event->cpu);
> + struct vpa_dtl *dtl = &per_cpu(vpa_dtl_cpu, event->cpu);
> +
> + vpa_dtl_stop_hrtimer(event);
> + unregister_dtl(hwcpu);
> + kmem_cache_free(dtl_cache, dtl->buf);
> + dtl->buf = NULL;
> + lppaca_of(event->cpu).dtl_enable_mask = 0x0;
> + dtl->active_lock = false;
> +}
> +
> +/*
> + * This function definition is empty as vpa_dtl_dump_sample_data
> + * is used to parse and dump the dispatch trace log data,
> + * to perf data.
> + */
> +static void vpa_dtl_event_read(struct perf_event *event)
> +{
> +}
> +
> +static struct pmu vpa_dtl_pmu = {
> + .task_ctx_nr = perf_invalid_context,
> +
> + .name = "vpa_dtl",
> + .attr_groups = attr_groups,
> + .event_init = vpa_dtl_event_init,
> + .add = vpa_dtl_event_add,
> + .del = vpa_dtl_event_del,
> + .read = vpa_dtl_event_read,
> + .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_EXCLUSIVE,
> +};
> +
> +static int vpa_dtl_init(void)
> +{
> + int r;
> +
> + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
> + pr_debug("not a shared virtualized system, not enabling\n");
> + return -ENODEV;
> + }
> +
> + /* This driver is intended only for L1 host. */
> + if (is_kvm_guest()) {
> + pr_debug("Only supported for L1 host system\n");
> + return -ENODEV;
> + }
> +
> + r = perf_pmu_register(&vpa_dtl_pmu, vpa_dtl_pmu.name, -1);
> + if (r)
> + return r;
> +
> + return 0;
> +}
> +
> +device_initcall(vpa_dtl_init);
> +#endif //CONFIG_PPC_SPLPAR
More information about the Linuxppc-dev
mailing list