[PATCH v9 3/9] misc: smpro-errmon: Add Ampere's SMpro error monitor driver

Quan Nguyen quan at os.amperecomputing.com
Thu Oct 6 18:44:37 AEDT 2022



On 29/09/2022 16:53, Greg Kroah-Hartman wrote:
> On Thu, Sep 29, 2022 at 04:43:15PM +0700, Quan Nguyen wrote:
>> This commit adds Ampere's SMpro error monitor driver for monitoring
>> and reporting RAS-related errors as reported by SMpro co-processor
>> found on Ampere's Altra processor family.
>>
>> Signed-off-by: Quan Nguyen <quan at os.amperecomputing.com>
>> ---
>> Changes in v9:
>>    + Fix ugly static struct define                               [Greg]
>>    + Remove unused defines and update documentation              [Quan]
>>    + Add minor refactor code                                     [Quan]
>>    + Fix messy goto                                              [Greg]
>>    + Update SPDX licence                                         [Greg]
>>    + Use ATTRIBUTE_GROUPS()                                      [Greg]
>>    + Use dev_groups instead of sysfs_create_group() to avoid
>>    racing issue with user space                                  [Greg]
>>    + Refactor code to fix unnecessary initialization issue       [Quan]
>>    + Refactor code to avoid clever encoding issue                [Quan]
>>    + Separate error_[smpro|pmpro] to error_* and warn_*          [Quan]
>>    + Add minor code refactor                                     [Quan]
>>
>> Changes in v8:
>>    + Update wording for SMPRO_ERRMON on Kconfig file             [Quan]
>>    + Avoid uninitialized variable use               [kernel test robot]
>>    + Switch to use sysfs_emit()                                  [Greg]
>>    + Make sysfs to return single value                           [Greg]
>>    + Change errors_* sysfs to error_*                            [Quan]
>>    + Add overflow_[core|mem|pcie|other]_[ce|ue] sysfs to report
>>    overflow status of each type of HW errors                     [Quan]
>>    + Add some minor refactor                                     [Quan]
>>
>> Changes in v7:
>>    + Remove regmap_acquire/release_lock(), read_i2c_block_data() [Quan]
>>    + Use regmap_noinc_read() instead of errmon_read_block()      [Quan]
>>    + Validate number of errors before read                       [Quan]
>>    + Fix wrong return type of *_show() function     [kernel test robot]
>>    + Adjust patch order to avoid dependence with smpro-mfd  [Lee Jones]
>>    + Use pointer instead of stack memory                         [Quan]
>>
>> Changes in v6:
>>    + First introduced in v6 [Quan]
>>
>>   drivers/misc/Kconfig        |  12 +
>>   drivers/misc/Makefile       |   1 +
>>   drivers/misc/smpro-errmon.c | 529 ++++++++++++++++++++++++++++++++++++
>>   3 files changed, 542 insertions(+)
>>   create mode 100644 drivers/misc/smpro-errmon.c
>>
>> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
>> index 358ad56f6524..b9ceee949dab 100644
>> --- a/drivers/misc/Kconfig
>> +++ b/drivers/misc/Kconfig
>> @@ -176,6 +176,18 @@ config SGI_XP
>>   	  this feature will allow for direct communication between SSIs
>>   	  based on a network adapter and DMA messaging.
>>   
>> +config SMPRO_ERRMON
>> +	tristate "Ampere Computing SMPro error monitor driver"
>> +	depends on MFD_SMPRO || COMPILE_TEST
>> +	help
>> +	  Say Y here to get support for the SMpro error monitor function
>> +	  provided by Ampere Computing's Altra and Altra Max SoCs. Upon
>> +	  loading, the driver creates sysfs files which can be use to gather
>> +	  multiple HW error data reported via read and write system calls.
>> +
>> +	  To compile this driver as a module, say M here. The driver will be
>> +	  called smpro-errmon.
>> +
>>   config CS5535_MFGPT
>>   	tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support"
>>   	depends on MFD_CS5535
>> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
>> index ac9b3e757ba1..bbe24d4511a3 100644
>> --- a/drivers/misc/Makefile
>> +++ b/drivers/misc/Makefile
>> @@ -23,6 +23,7 @@ obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o
>>   obj-$(CONFIG_KGDB_TESTS)	+= kgdbts.o
>>   obj-$(CONFIG_SGI_XP)		+= sgi-xp/
>>   obj-$(CONFIG_SGI_GRU)		+= sgi-gru/
>> +obj-$(CONFIG_SMPRO_ERRMON)	+= smpro-errmon.o
>>   obj-$(CONFIG_CS5535_MFGPT)	+= cs5535-mfgpt.o
>>   obj-$(CONFIG_GEHC_ACHC)		+= gehc-achc.o
>>   obj-$(CONFIG_HP_ILO)		+= hpilo.o
>> diff --git a/drivers/misc/smpro-errmon.c b/drivers/misc/smpro-errmon.c
>> new file mode 100644
>> index 000000000000..d1431d419aa4
>> --- /dev/null
>> +++ b/drivers/misc/smpro-errmon.c
>> @@ -0,0 +1,529 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Ampere Computing SoC's SMpro Error Monitoring Driver
>> + *
>> + * Copyright (c) 2022, Ampere Computing LLC
>> + *
>> + */
>> +
>> +#include <linux/i2c.h>
>> +#include <linux/mod_devicetable.h>
>> +#include <linux/module.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/regmap.h>
>> +
>> +/* GPI RAS Error Registers */
>> +#define GPI_RAS_ERR		0x7E
>> +
>> +/* Core and L2C Error Registers */
>> +#define CORE_CE_ERR_CNT		0x80
>> +#define CORE_CE_ERR_LEN		0x81
>> +#define CORE_CE_ERR_DATA	0x82
>> +#define CORE_UE_ERR_CNT		0x83
>> +#define CORE_UE_ERR_LEN		0x84
>> +#define CORE_UE_ERR_DATA	0x85
>> +
>> +/* Memory Error Registers */
>> +#define MEM_CE_ERR_CNT		0x90
>> +#define MEM_CE_ERR_LEN		0x91
>> +#define MEM_CE_ERR_DATA		0x92
>> +#define MEM_UE_ERR_CNT		0x93
>> +#define MEM_UE_ERR_LEN		0x94
>> +#define MEM_UE_ERR_DATA		0x95
>> +
>> +/* RAS Error/Warning Registers */
>> +#define ERR_SMPRO_TYPE		0xA0
>> +#define ERR_PMPRO_TYPE		0xA1
>> +#define ERR_SMPRO_INFO_LO	0xA2
>> +#define ERR_SMPRO_INFO_HI	0xA3
>> +#define ERR_SMPRO_DATA_LO	0xA4
>> +#define ERR_SMPRO_DATA_HI	0xA5
>> +#define WARN_SMPRO_INFO_LO	0xAA
>> +#define WARN_SMPRO_INFO_HI	0xAB
>> +#define ERR_PMPRO_INFO_LO	0xA6
>> +#define ERR_PMPRO_INFO_HI	0xA7
>> +#define ERR_PMPRO_DATA_LO	0xA8
>> +#define ERR_PMPRO_DATA_HI	0xA9
>> +#define WARN_PMPRO_INFO_LO	0xAC
>> +#define WARN_PMPRO_INFO_HI	0xAD
>> +
>> +/* PCIE Error Registers */
>> +#define PCIE_CE_ERR_CNT		0xC0
>> +#define PCIE_CE_ERR_LEN		0xC1
>> +#define PCIE_CE_ERR_DATA	0xC2
>> +#define PCIE_UE_ERR_CNT		0xC3
>> +#define PCIE_UE_ERR_LEN		0xC4
>> +#define PCIE_UE_ERR_DATA	0xC5
>> +
>> +/* Other Error Registers */
>> +#define OTHER_CE_ERR_CNT	0xD0
>> +#define OTHER_CE_ERR_LEN	0xD1
>> +#define OTHER_CE_ERR_DATA	0xD2
>> +#define OTHER_UE_ERR_CNT	0xD8
>> +#define OTHER_UE_ERR_LEN	0xD9
>> +#define OTHER_UE_ERR_DATA	0xDA
>> +
>> +/* Event Data Registers */
>> +#define VRD_WARN_FAULT_EVENT_DATA	0x78
>> +#define VRD_HOT_EVENT_DATA		0x79
>> +#define DIMM_HOT_EVENT_DATA		0x7A
>> +
>> +#define MAX_READ_BLOCK_LENGTH	48
>> +
>> +#define RAS_SMPRO_ERR		0
>> +#define RAS_PMPRO_ERR		1
>> +
>> +enum RAS_48BYTES_ERR_TYPES {
>> +	CORE_CE_ERR,
>> +	CORE_UE_ERR,
>> +	MEM_CE_ERR,
>> +	MEM_UE_ERR,
>> +	PCIE_CE_ERR,
>> +	PCIE_UE_ERR,
>> +	OTHER_CE_ERR,
>> +	OTHER_UE_ERR,
>> +	NUM_48BYTES_ERR_TYPE,
>> +};
>> +
>> +struct smpro_error_hdr {
>> +	u8 count;	/* Number of the RAS errors */
>> +	u8 len;		/* Number of data bytes */
>> +	u8 data;	/* Start of 48-byte data */
>> +	u8 max_cnt;	/* Max num of errors */
>> +};
>> +
>> +/*
>> + * Included Address of registers to get Count, Length of data and Data
>> + * of the 48 bytes error data
>> + */
>> +static struct smpro_error_hdr smpro_error_table[] = {
>> +	[CORE_CE_ERR] = {
>> +		.count = CORE_CE_ERR_CNT,
>> +		.len = CORE_CE_ERR_LEN,
>> +		.data = CORE_CE_ERR_DATA,
>> +		.max_cnt = 32
>> +	},
>> +	[CORE_UE_ERR] = {
>> +		.count = CORE_UE_ERR_CNT,
>> +		.len = CORE_UE_ERR_LEN,
>> +		.data = CORE_UE_ERR_DATA,
>> +		.max_cnt = 32
>> +	},
>> +	[MEM_CE_ERR] = {
>> +		.count = MEM_CE_ERR_CNT,
>> +		.len = MEM_CE_ERR_LEN,
>> +		.data = MEM_CE_ERR_DATA,
>> +		.max_cnt = 16
>> +	},
>> +	[MEM_UE_ERR] = {
>> +		.count = MEM_UE_ERR_CNT,
>> +		.len = MEM_UE_ERR_LEN,
>> +		.data = MEM_UE_ERR_DATA,
>> +		.max_cnt = 16
>> +	},
>> +	[PCIE_CE_ERR] = {
>> +		.count = PCIE_CE_ERR_CNT,
>> +		.len = PCIE_CE_ERR_LEN,
>> +		.data = PCIE_CE_ERR_DATA,
>> +		.max_cnt = 96
>> +	},
>> +	[PCIE_UE_ERR] = {
>> +		.count = PCIE_UE_ERR_CNT,
>> +		.len = PCIE_UE_ERR_LEN,
>> +		.data = PCIE_UE_ERR_DATA,
>> +		.max_cnt = 96
>> +	},
>> +	[OTHER_CE_ERR] = {
>> +		.count = OTHER_CE_ERR_CNT,
>> +		.len = OTHER_CE_ERR_LEN,
>> +		.data = OTHER_CE_ERR_DATA,
>> +		.max_cnt = 8
>> +	},
>> +	[OTHER_UE_ERR] = {
>> +		.count = OTHER_UE_ERR_CNT,
>> +		.len = OTHER_UE_ERR_LEN,
>> +		.data = OTHER_UE_ERR_DATA,
>> +		.max_cnt = 8
>> +	},
>> +};
>> +
>> +/*
>> + * List of SCP registers which are used to get
>> + * one type of RAS Internal errors.
>> + */
>> +struct smpro_int_error_hdr {
>> +	u8 type;
>> +	u8 info_l;
>> +	u8 info_h;
>> +	u8 data_l;
>> +	u8 data_h;
>> +	u8 warn_l;
>> +	u8 warn_h;
>> +};
>> +
>> +static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = {
>> +	[RAS_SMPRO_ERR] = {
>> +		.type = ERR_SMPRO_TYPE,
>> +		.info_l = ERR_SMPRO_INFO_LO,
>> +		.info_h = ERR_SMPRO_INFO_HI,
>> +		.data_l = ERR_SMPRO_DATA_LO,
>> +		.data_h = ERR_SMPRO_DATA_HI,
>> +		.warn_l = WARN_SMPRO_INFO_LO,
>> +		.warn_h = WARN_SMPRO_INFO_HI,
>> +	},
>> +	[RAS_PMPRO_ERR] = {
>> +		.type = ERR_PMPRO_TYPE,
>> +		.info_l = ERR_PMPRO_INFO_LO,
>> +		.info_h = ERR_PMPRO_INFO_HI,
>> +		.data_l = ERR_PMPRO_DATA_LO,
>> +		.data_h = ERR_PMPRO_DATA_HI,
>> +		.warn_l = WARN_PMPRO_INFO_LO,
>> +		.warn_h = WARN_PMPRO_INFO_HI,
>> +	},
>> +};
>> +
>> +struct smpro_errmon {
>> +	struct regmap *regmap;
>> +};
>> +
>> +enum EVENT_TYPES {
>> +	VRD_WARN_FAULT_EVENT,
>> +	VRD_HOT_EVENT,
>> +	DIMM_HOT_EVENT,
>> +	NUM_EVENTS_TYPE,
>> +};
>> +
>> +/* Included Address of event source and data registers */
>> +static u8 smpro_event_table[NUM_EVENTS_TYPE] = {
>> +	VRD_WARN_FAULT_EVENT_DATA,
>> +	VRD_HOT_EVENT_DATA,
>> +	DIMM_HOT_EVENT_DATA,
>> +};
>> +
>> +static ssize_t smpro_event_data_read(struct device *dev,
>> +				     struct device_attribute *da, char *buf,
>> +				     int channel)
>> +{
>> +	struct smpro_errmon *errmon = dev_get_drvdata(dev);
>> +	s32 event_data;
>> +	int ret;
>> +
>> +	ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data);
>> +	if (ret)
>> +		return ret;
>> +	/* Clear event after read */
>> +	if (event_data != 0)
>> +		regmap_write(errmon->regmap, smpro_event_table[channel], event_data);
>> +
>> +	return sysfs_emit(buf, "%04x\n", event_data);
>> +}
>> +
>> +static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da,
>> +					char *buf, int channel)
>> +{
>> +	struct smpro_errmon *errmon = dev_get_drvdata(dev);
>> +	struct smpro_error_hdr *err_info;
>> +	s32 err_count;
>> +	int ret;
>> +
>> +	err_info = &smpro_error_table[channel];
>> +
>> +	ret = regmap_read(errmon->regmap, err_info->count, &err_count);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* Bit 8 indicates the overflow status */
>> +	return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0);
>> +}
> 
> Where is the Documentation/ABI/ entry for this field?
> 
It is documented under the entry for the 
"/sys/bus/platform/devices/smpro-errmon.*/overflow_[core|mem|pcie|other]_[ce|ue]" 
in the [9/9] patch.

> Please put that in the same commit so that it is easier to validate that
> you really did document everything properly.
> 

I'll follow this suggestion in next version.

Thank your for the reviews,
- Quan


More information about the openbmc mailing list