[Skiboot] [PATCH v12 07/10] skiboot: Find the IMC DTB

Madhavan Srinivasan maddy at linux.vnet.ibm.com
Thu Jun 15 00:36:50 AEST 2017



On Wednesday 14 June 2017 09:47 AM, Michael Neuling wrote:
> On Sun, 2017-05-21 at 20:40 +0530, Madhavan Srinivasan wrote:
>>> From: Hemant Kumar <hemant at linux.vnet.ibm.com>
>> IMC (In Memory Collection) catalog is a repository of information
>> about the Performance Monitoring Units (PMUs) and their events under
>> the IMC infrastructure. The information include :
>>   - The PMU names
>>   - Event names
>>   - Event description
>>   - Event offsets
>>   - Event scale
>>   - Event unit
>>
>> The catalog is provided as a flattened device tree (dtb). Processors
>> with different PVR values may have different PMU or event names. Hence,
>> for each processor, there can be multiple device tree binaries (dtbs)
>> containing the IMC information. Each of the dtb is compressed and forms
>> a sub-partition inside the PNOR partition "IMA_CATALOG". Here is a link
>> to the commit adding this partition to PNOR :
>> https://github.com/open-power/pnor/commit/c940142c6dc64dd176096dc648f433c889919e84
>>
>> So, each compressed dtb forms a sub-partition inside the IMC pnor
>> partition and can be accessed/loaded through a sub-partition id which
>> is nothing but the PVR id. Based on the current processor's PVR, the
>> appropriate sub-partion will be loaded.
>>
>> Note however, that the catalog information is in the form of a dtb and
>> the dtb is compressed too. So, the sub-partition loaded must be
>> decompressed first before we can actually use it.
>>
>> It is important to mention here that while a PNOR image built for one
>> processor is specific to only that processor and isn't portable, a
>> single system generation (Processor version) may have multiple revisions
>> and these revisions may have some changes in their IMC PMUs and events,
>> and hence, the need for multiple IMC DTBs.
>>
>> The sub-partition that we obtain from the IMC pnor partition is a
>> compressed device tree binary. We uncompress it using the libxz's
>> functions. After uncompressing it, we link the device tree binary to the
>> system's device tree. The kernel can now access the device tree and get
>> the IMC PMUs and their events' information.
>>
>> Not all the IMC PMUs listed in the device tree may be available. This is
>> indicated by imc availability vector (which is a part of the IMC control
>> block structure). We need to check this vector and make sure to remove
>> the IMC device nodes which are unavailable.
>>
>>> Signed-off-by: Hemant Kumar <hemant at linux.vnet.ibm.com>
>>> Signed-off-by: Anju T Sudhakar <anju at linux.vnet.ibm.com>
>>> Signed-off-by: Madhavan Srinivasan <maddy at linux.vnet.ibm.com>
>> ---
>>   core/flash.c       |   1 +
>>   core/init.c        |   7 ++
>>   hw/Makefile.inc    |   2 +-
>>   hw/imc.c           | 338 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   include/imc.h      |   2 +
>>   include/opal-api.h |  11 ++
>>   include/platform.h |   1 +
>>   7 files changed, 361 insertions(+), 1 deletion(-)
>>   create mode 100644 hw/imc.c
>>
>> diff --git a/core/flash.c b/core/flash.c
>> index 793401c94615..e5f8452a17c7 100644
>> --- a/core/flash.c
>> +++ b/core/flash.c
>> @@ -421,6 +421,7 @@ static struct {
>>>>>   	{ RESOURCE_ID_KERNEL,	RESOURCE_SUBID_NONE,		"BOOTKERNEL" },
>>>>   	{ RESOURCE_ID_INITRAMFS,RESOURCE_SUBID_NONE,		"ROOTFS" },
>>>>>   	{ RESOURCE_ID_CAPP,	RESOURCE_SUBID_SUPPORTED,	"CAPP" },
>>>> +	{ RESOURCE_ID_CATALOG,  RESOURCE_SUBID_SUPPORTED,	"IMA_CATALOG" },
>>   };
>>   
>>   
>> diff --git a/core/init.c b/core/init.c
>> index dce10fd6a20b..d9f359a5b033 100644
>> --- a/core/init.c
>> +++ b/core/init.c
>> @@ -48,6 +48,7 @@
>>   #include <libstb/stb.h>
>>   #include <libstb/container.h>
>>   #include <phys-map.h>
>> +#include <imc.h>
>>   
>>   enum proc_gen proc_gen;
>>   unsigned int pcie_max_link_speed;
>> @@ -982,6 +983,9 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>>>   	/* Read in NVRAM and set it up */
>>>   	nvram_init();
>>   
>>> +	/* preload the IMC catalog dtb */
>>> +	imc_catalog_preload();
>> +
>>>   	/* Set the console level */
>>>   	console_log_level();
>>   
>> @@ -1005,6 +1009,9 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>>>   	/* NX init */
>>>   	nx_init();
>>   
>>> +	/* Init In-Memory Collection related stuff (load the IMC dtb into memory) */
>>> +	imc_init();
>> +
>>>   	/* Probe IO hubs */
>>>   	probe_p7ioc();
>>   
>> diff --git a/hw/Makefile.inc b/hw/Makefile.inc
>> index 97080aad17c3..13b085e82795 100644
>> --- a/hw/Makefile.inc
>> +++ b/hw/Makefile.inc
>> @@ -1,7 +1,7 @@
>>   # -*-Makefile-*-
>>   SUBDIRS += hw
>>   HW_OBJS  = xscom.o chiptod.o gx.o cec.o lpc.o lpc-uart.o psi.o
>> -HW_OBJS += homer.o slw.o occ.o fsi-master.o centaur.o
>> +HW_OBJS += homer.o slw.o occ.o fsi-master.o centaur.o imc.o
>>   HW_OBJS += nx.o nx-rng.o nx-crypto.o nx-842.o
>>   HW_OBJS += p7ioc.o p7ioc-inits.o p7ioc-phb.o
>>   HW_OBJS += phb3.o sfc-ctrl.o fake-rtc.o bt.o p8-i2c.o prd.o
>> diff --git a/hw/imc.c b/hw/imc.c
>> new file mode 100644
>> index 000000000000..00e19154d641
>> --- /dev/null
>> +++ b/hw/imc.c
>> @@ -0,0 +1,338 @@
>> +/* Copyright 2016 IBM Corp.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at
>> + *
>> + *     http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> + * implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#include <skiboot.h>
>> +#include <xscom.h>
>> +#include <imc.h>
>> +#include <chip.h>
>> +#include <libxz/xz.h>
>> +#include <device.h>
>> +
>> +/*
>> + * Nest IMC PMU names along with their bit values as represented in the
>> + * imc_chip_avl_vector(in struct imc_chip_cb, look at include/imc.h).
>> + * nest_pmus[] is an array containing all the possible nest IMC PMU node names.
>> + */
>> +char const *nest_pmus[] = {
>>> +	"powerbus0",
>>> +	"mcs0",
>>> +	"mcs1",
>>> +	"mcs2",
>>> +	"mcs3",
>>> +	"mcs4",
>>> +	"mcs5",
>>> +	"mcs6",
>>> +	"mcs7",
>>> +	"mba0",
>>> +	"mba1",
>>> +	"mba2",
>>> +	"mba3",
>>> +	"mba4",
>>> +	"mba5",
>>> +	"mba6",
>>> +	"mba7",
>>> +	"cen0",
>>> +	"cen1",
>>> +	"cen2",
>>> +	"cen3",
>>> +	"cen4",
>>> +	"cen5",
>>> +	"cen6",
>>> +	"cen7",
>>> +	"xlink0",
>>> +	"xlink1",
>>> +	"xlink2",
>>> +	"mcd0",
>>> +	"mcd1",
>>> +	"phb0",
>>> +	"phb1",
>>> +	"phb2",
>>> +	"resvd",
>>> +	"nx",
>>> +	"capp0",
>>> +	"capp1",
>>> +	"vas",
>>> +	"int",
>>> +	"alink0",
>>> +	"alink1",
>>> +	"alink2",
>>> +	"nvlink0",
>>> +	"nvlink1",
>>> +	"nvlink2",
>>> +	"nvlink3",
>>> +	"nvlink4",
>>> +	"nvlink5",
>>> +	/* reserved bits : 48 - 64 */
>> +};
>> +
>> +char *imc_catalog_buf;
>> +size_t imc_catalog_size;
>> +const char **imc_prop_to_fix(struct dt_node *node);
>> +const char *prop_to_fix[] = {"events", NULL};
>> +
>> +static struct imc_chip_cb *get_imc_cb(void)
>> +{
>>> +	uint64_t cb_loc;
>>> +	struct proc_chip *chip = get_chip(this_cpu()->chip_id);
>> +
>> +	cb_loc = chip->homer_base + P9_CB_STRUCT_OFFSET;
> We are sure the homer_base has been inited by now and is correct?

As per the microcode team the homer region control block will be
initialized at the system boot time with the proper value. That said,
I agree it is better to have a check here. Will add one.


>
>> +	return (struct imc_chip_cb *)cb_loc;
>> +}
>> +
>> +/* helper to get number of chips in the system */
>> +static inline int nr_chips(void)
> This would be better somewhere else like chip.h

Sure, i can move it out.

>
>> +{
>>> +	struct proc_chip *chip;
>>> +	int nr_chips = 0;
>> +
>>> +	for_each_chip(chip)
>>> +		nr_chips++;
>> +
>>> +	return nr_chips;
>> +}
>> +
>> +/*
>> + * Decompresses the blob obtained from the IMC pnor sub-partition
>> + * in "buf" of size "size", assigns the uncompressed device tree
>> + * binary to "fdt" and returns.
>> + *
>> + * Returns 0 on success and -1 on error.
>> + *
>> + * TODO: Ideally this should be part of generic subpartition load
>> + * infrastructure. And decompression can be queued as another CPU job
>> + */
>> +static int decompress_subpartition(char *buf, size_t size, void **fdt)
> fdt param makes me think this has something to do with device tree but there is
> nothing device tree related here.  Can you just make it more like memcpy.

Ok What we have in the pnor partition is multiple device tree blob
in a compressed form and bundled together. Something similar to
capp ucode. So in the preload function we provide the pvr
as the subid to select the right compressed device tree blob
to this system. This is what is passed to this function. So when
we decompressed it, we get a device tree blob. So we named it
that.

> decompress(void *dst, size_t dst_size, void *src, size_t src_size)
>
> then you don't need to assume the malloc size either (ie
> MAX_UNCOMPRESSED_IMC_DTB_SIZE)

Yes we would do this something in the lines of memcpy.

>
>> +{
>>> +	struct xz_dec *s;
>>> +	struct xz_buf b;
>>> +	void *data;
>>> +	int ret = 0;
>> +
>>> +	/* Initialize the xz library first */
>>> +	xz_crc32_init();
>>> +	s = xz_dec_init(XZ_SINGLE, 0);
>>> +	if (s == NULL) {
>>> +		prerror("IMC: initialization error for xz\n");
>>> +		return -1;
>>> +	}
>> +
>>> +	/* Allocate memory for the uncompressed data */
>>> +	data = malloc(MAX_COMPRESSED_IMC_DTB_SIZE);
>>> +	if (!data) {
>>> +		prerror("IMC: memory allocation error\n");
>>> +		ret = -1;
>>> +		goto err;
>>> +	}
>> +
>>> +	/*
>>> +	 * Source address : buf
>>> +	 * Source size : size
>>> +	 * Destination address : data
>>> +	 * Destination size : MAX_UNCOMPRESSED_IMC_DTB_SIZE
>>> +	 */
>>> +	b.in = buf;
>>> +	b.in_pos = 0;
>>> +	b.in_size = size;
>>> +	b.out = data;
>>> +	b.out_pos = 0;
>>> +	b.out_size = MAX_UNCOMPRESSED_IMC_DTB_SIZE;
>> +
>>> +	/* Start decompressing */
>>> +	ret = xz_dec_run(s, &b);
>>> +	if (ret != XZ_STREAM_END) {
>>> +		prerror("IMC: failed to decompress subpartition\n");
>>> +		free(data);
>> +		ret = -1;
> Should we goto err here rather than also changing *fdt?

Yes true. Nice catch. Will fix it.


>
>> +	}
>> +	*fdt = data;
> I really don't like fdt as the name here.

OK will change it in the lines of memcpy suggestion.

>
>> +
>> +err:
>>> +	/* Clean up memory */
>>> +	xz_dec_end(s);
>>> +	return ret;
>> +}
>> +
>> +/*
>> + * Function return list of properties names for the fixup
>> + */
>> +const char **imc_prop_to_fix(struct dt_node *node)
>> +{
>>> +	if (dt_node_is_compatible(node, "ibm,imc-counters"))
>>> +		return prop_to_fix;
>> +
>>> +	return NULL;
>> +}
>> +
>> +/*
>> + * Remove the PMU device nodes from the ncoming new subtree, if they are not
> incoming
>
>> + * available in the hardware. The availability is described by the
>> + * control block's imc_chip_avl_vector.
>> + * Each bit represents a device unit. If the device is available, then
>> + * the bit is set else its unset.
>> + */
>> +static int disable_unavailable_units(struct dt_node *dev)
>> +{
>>> +	uint64_t avl_vec;
>>> +	struct imc_chip_cb *cb;
>>> +	struct dt_node *target;
>>> +	int i;
>> +
>>> +	/* Fetch the IMC control block structure */
>>> +	cb = get_imc_cb();
>> +
>> +	avl_vec = be64_to_cpu(cb->imc_chip_avl_vector);
>>> +	for (i = 0; i < MAX_NEST_UNITS; i++) {
>>> +		if (!(PPC_BITMASK(i, i) & avl_vec)) {
>>> +			/* Check if the device node exists */
>>> +			target = dt_find_by_name(dev, nest_pmus[i]);
>>> +			if (!target)
>>> +				continue;
>>> +			/* Remove the device node */
>>> +			dt_free(target);
>>> +		}
>>> +	}
>> +
>>> +	return 0;
>> +}
>> +
>> +/*
>> + * Function to queue the loading of imc catalog data
>> + * from the IMC pnor partition.
>> + */
>> +void imc_catalog_preload(void)
>> +{
>>> +	uint32_t pvr = mfspr(SPR_PVR);
>>> +	int ret = OPAL_SUCCESS;
>> +	imc_catalog_size = MAX_UNCOMPRESSED_IMC_DTB_SIZE;
> Should we sanity check MAX_UNCOMPRESSED_IMC_DTB_SIZE vs the FFS partition size.

Just trying to understand how this check will help. Infact going forward
if they reduce the pnor partition side we will break right? Is that the
indent to have the check?

>
> Also, what's the process here?  We download the parition in to imc_catalog_buf

Previously we had the preload of the partition resource and 
wait_for_resource_load
calls in the same function, in which case we were suggested to split it
and have it is two different functions. That way we could queue the
pre-loading part.

>
>> +
>>> +	/* Enable only for power 9 */
>>> +	if (proc_gen != proc_gen_p9)
>>> +		return;
>> +
>>> +	imc_catalog_buf = malloc(MAX_UNCOMPRESSED_IMC_DTB_SIZE);
>>> +	if (!imc_catalog_buf) {
>>> +		prerror("IMC: Memory allocation for catalog failed\n");
>>> +		return;
>>> +	}
>> +
>>> +	ret = start_preload_resource(RESOURCE_ID_CATALOG,
>>> +					pvr, imc_catalog_buf, &imc_catalog_size);
>>> +	if (ret != OPAL_SUCCESS)
>>> +		free(imc_catalog_buf);
>> +
>>> +	return;
>> +}
>> +
>> +static bool is_nest_node(struct dt_node *node)
>> +{
>>> +	const struct dt_property *type = dt_find_property(node, "type");
>>> +	u32 val=0;
>> +
>>> +	if (type) {
>>> +		val = dt_prop_get_u32(node, "type");
>>> +		if (val == IMC_COUNTER_PER_CHIP)
>>> +			return true;
>>> +	}
>> +
>>> +	return false;
>> +}
>> +
>> +static void imc_dt_update_nest_node(struct dt_node *dev)
>> +{
>>> +	struct proc_chip *chip;
>>> +	uint64_t *base_addr = NULL;
>>> +	uint32_t *chipids = NULL;
>>> +	int i=0, nr_chip = nr_chips();
>>> +	struct dt_node *node;
>>> +	const struct dt_property *type;
>> +
>>> +	/* Add the base_addr and chip-id properties for the nest node */
>>> +	base_addr = malloc(sizeof(uint64_t) * nr_chip);
>>> +	chipids = malloc(sizeof(uint32_t) * nr_chip);
>>> +	for_each_chip(chip) {
>>> +		base_addr[i] = chip->homer_base;
>>> +		chipids[i] = chip->id;
>>> +		i++;
>>> +	}
>> +
>>> +	dt_for_each_compatible(dev, node, "ibm,imc-counters") {
>>> +		type = dt_find_property(node, "type");
>>> +		if (type && is_nest_node(node)) {
>>> +			dt_add_property(node, "base-addr", base_addr, (i * sizeof(u64)));
>>> +			dt_add_property(node, "chip-id", chipids, (i * sizeof(u32)));
>>> +		}
>>> +	}
>> +}
>> +
>> +/*
>> + * Load the IMC pnor partition and find the appropriate sub-partition
>> + * based on the platform's PVR.
>> + * Decompress the sub-partition and link the imc device tree to the
>> + * existing device tree.
>> + */
>> +void imc_init(void)
>> +{
>> +	void *fdt = NULL;
> No need to init this.

ok.

>
>> +	uint32_t pvr = mfspr(SPR_PVR);
>>> +	struct dt_node *dev;
>>> +	int ret;
>> +
>>> +	/* Enable only for power 9 */
>>> +	if (proc_gen != proc_gen_p9)
>>> +		return;
>> +
>> +	ret = wait_for_resource_loaded(RESOURCE_ID_CATALOG, pvr);
> Should we be masking out some of the PVR here?  Some of the bits in the PVR are
> just for normal vs fused core.  I assume those catalogs would be the same

Yes. I got to know that yesterday in our internal call. I will mask it out.
Yes the catalog is same for both fused vs normal core.

>
>> +	if (ret != OPAL_SUCCESS) {
>>> +		prerror("IMC Catalog load failed\n");
>>> +		return;
>>> +	}
>> +
>>> +	/* Decompress the subpartition now */
>>> +	ret = decompress_subpartition(imc_catalog_buf, imc_catalog_size, &fdt);
>>> +	if (ret < 0)
>> +		goto err;
> Can you explain the flow here from PNOR -> device tree.  It seems to go:
>
>      PNOR ->
>      compressed local buffer ->
>      decompressed local buffer ->
>      attach decompressed buffer to device tree and free compressed local buffer

Yes I will document the flow.

>> +
>>> +	/* Create a device tree entry for imc counters */
>>> +	dev = dt_new_root("imc-counters");
>>> +	if (!dev)
>>> +		goto err;
>> +
>> +	/* Attach the new fdt to the imc-counters node */
> This also sanity checks the device tree we get from flash?

yes good point. I just checked the dt_expand_node function
we dont do any checked for the fdt parameter there. I will
add one here.

>
>> +	ret = dt_expand_node(dev, fdt, 0);
>>> +	if (ret < 0) {
>>> +		dt_free(dev);
>>> +		goto err;
>>> +	}
>> +
>> +	dt_adjust_subtree_phandle(dev, imc_prop_to_fix);
> This is to make the phandles in the new device tree unique for the device tree
> we are importing it into?  A comment would be nice.

Yes definitely will add it.

>
>> +
>>> +	/* Check availability of the Nest PMU units from the availability vector */
>>> +	disable_unavailable_units(dev);
>> +
>>> +	/* Update the base_addr and chip-id for nest nodes */
>>> +	imc_dt_update_nest_node(dev);
>> +
>>> +	if (!dt_attach_root(dt_root, dev)) {
>>> +		dt_free(dev);
>>> +		goto err;
>> +	}
> So if this fails, we are assuming the OS will never call the OPAL calls since it
> won't see the device tree?
>
> If the OS does, what fall back to do have or do we consider the OS broken at
> that point?

Yes if this fails we will remove the imc-counters  from the device tree.
And in the kernel, we have the platform autoload function which will
look for the compatibility field to load the IMC drivers.  So I am not sure
whether we should consider a scenario of IMC driver getting loaded
without the IMC device tree nodes. But still if that happens we dont
have a fall back at this point.

>> +
>>> +	free(imc_catalog_buf);
>>> +	return;
>> +err:
>>> +	prerror("IMC Devices not added\n");
>>> +	free(imc_catalog_buf);
>> +}
>> diff --git a/include/imc.h b/include/imc.h
>> index 494e71fcac9a..c7ad5fa933b7 100644
>> --- a/include/imc.h
>> +++ b/include/imc.h
>> @@ -116,4 +116,6 @@ struct imc_chip_cb
>>   
>>>   #define MAX_NEST_UNITS			48
>>   
>> +void imc_init(void);
>> +void imc_catalog_preload(void);
>>   #endif /* __IMC_H */
>> diff --git a/include/opal-api.h b/include/opal-api.h
>> index 80033c6fa77f..44276553b818 100644
>> --- a/include/opal-api.h
>> +++ b/include/opal-api.h
>> @@ -1215,6 +1215,17 @@ enum {
>>>>   	XIVE_DUMP_EMU_STATE	= 5,
>>   };
>>   
>> +/* In-Memory Collection Counters Type */
>> +enum {
>>>> +	IMC_COUNTER_PER_THREAD		= 0x1,
>>>> +	IMC_COUNTER_PER_SUB_CORE	= 0x2,
>>>> +	IMC_COUNTER_PER_CORE		= 0x4,
>>>> +	IMC_COUNTER_PER_QUAD		= 0x8,
>> +	IMC_COUNTER_PER_CHIP		= 0x10,
> Only CHIP is used in this patch set.
>
> Why are we creating this opal API if only CHIP is ever used?  Can we delete the
> rest of these?
 From the OPAL patchset yes we only use the "CHIP",
but in the kernel side we use the "CORE" and "THREAD"
properties. That said, I can make this to map the
current OPAL counter type and remove the rest?

>
>> +	IMC_COUNTER_PER_SOCKET		= 0x20,
>>>> +	IMC_COUNTER_PER_SYSTEM		= 0x40,
>> +
>> +
>>   #endif /* __ASSEMBLY__ */
>>   
>>   #endif /* __OPAL_API_H */
>> diff --git a/include/platform.h b/include/platform.h
>> index 9133204207d6..246daf285e75 100644
>> --- a/include/platform.h
>> +++ b/include/platform.h
>> @@ -27,6 +27,7 @@ enum resource_id {
>>>   	RESOURCE_ID_KERNEL,
>>>   	RESOURCE_ID_INITRAMFS,
>>>   	RESOURCE_ID_CAPP,
>> +	RESOURCE_ID_CATALOG,
> This should be IMA_CATALOG.  The same as it's called in the PNOR.

Yes will changes.

Thanks for the review.

Maddy
>
>>   };
>>   #define RESOURCE_SUBID_NONE 0
>>   #define RESOURCE_SUBID_SUPPORTED 1



More information about the Skiboot mailing list