[linux, dev-4.10, 6/6] drivers/hwmon: Add a driver for a generic PECI hwmon
Jae Hyun Yoo
jae.hyun.yoo at linux.intel.com
Fri Jan 12 06:47:01 AEDT 2018
On 1/10/2018 1:47 PM, Guenter Roeck wrote:
> On Tue, Jan 09, 2018 at 02:31:26PM -0800, Jae Hyun Yoo wrote:
>> This commit adds driver implementation for a generic PECI hwmon.
>>
>> Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo at linux.intel.com>
>> ---
>> drivers/hwmon/Kconfig | 6 +
>> drivers/hwmon/Makefile | 1 +
>> drivers/hwmon/peci-hwmon.c | 953 +++++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 960 insertions(+)
>> create mode 100644 drivers/hwmon/peci-hwmon.c
>>
>> diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
>> index 9256dd0..3a62c60 100644
>> --- a/drivers/hwmon/Kconfig
>> +++ b/drivers/hwmon/Kconfig
>> @@ -1234,6 +1234,12 @@ config SENSORS_NCT7904
>> This driver can also be built as a module. If so, the module
>> will be called nct7904.
>>
>> +config SENSORS_PECI_HWMON
>> + tristate "PECI hwmon support"
>> + depends on ASPEED_PECI
>> + help
>> + If you say yes here you get support for the generic PECI hwmon driver.
>> +
>> config SENSORS_NSA320
>> tristate "ZyXEL NSA320 and compatible fan speed and temperature sensors"
>> depends on GPIOLIB && OF
>> diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
>> index 98000fc..41d43a5 100644
>> --- a/drivers/hwmon/Makefile
>> +++ b/drivers/hwmon/Makefile
>> @@ -131,6 +131,7 @@ obj-$(CONFIG_SENSORS_NCT7802) += nct7802.o
>> obj-$(CONFIG_SENSORS_NCT7904) += nct7904.o
>> obj-$(CONFIG_SENSORS_NSA320) += nsa320-hwmon.o
>> obj-$(CONFIG_SENSORS_NTC_THERMISTOR) += ntc_thermistor.o
>> +obj-$(CONFIG_SENSORS_PECI_HWMON) += peci-hwmon.o
>> obj-$(CONFIG_SENSORS_PC87360) += pc87360.o
>> obj-$(CONFIG_SENSORS_PC87427) += pc87427.o
>> obj-$(CONFIG_SENSORS_PCF8591) += pcf8591.o
>> diff --git a/drivers/hwmon/peci-hwmon.c b/drivers/hwmon/peci-hwmon.c
>> new file mode 100644
>> index 0000000..2d2a288
>> --- /dev/null
>> +++ b/drivers/hwmon/peci-hwmon.c
>> @@ -0,0 +1,953 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +// Copyright (c) 2017 Intel Corporation
>> +
>> +#include <linux/delay.h>
>> +#include <linux/hwmon.h>
>> +#include <linux/hwmon-sysfs.h>
>> +#include <linux/jiffies.h>
>> +#include <linux/module.h>
>> +#include <linux/mutex.h>
>> +#include <linux/of_platform.h>
>> +#include <linux/of_device.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/syscalls.h>
>> +#include <misc/peci.h>
>
> misc, not linux ? That seems wrong.
>
You are right. I'll fix it.
>> +
>> +#define DEVICE_NAME "peci-hwmon"
>> +#define HWMON_NAME "peci_hwmon"
>> +
>> +#define CPU_ID_MAX 8 /* Max CPU number configured by socket ID */
>> +#define DIMM_NUMS_MAX 16 /* Max DIMM numbers (channel ranks x 2) */
>> +#define CORE_NUMS_MAX 28 /* Max core numbers (max on SKX Platinum) */
>
> I won't insist, but it would be better if this were dynamic,
> otherwise we'll end up having to increase the defines in the future.
>
Right. As you said, these values should be manually adjusted in the
future if CPU architecture has been changed so better implement it as
dynamic. I will check again a way of getting these values from client
CPU thru PECI connection.
>> +#define TEMP_TYPE_PECI 6 /* Sensor type 6: Intel PECI */
>> +#define CORE_INDEX_OFFSET 100 /* sysfs filename start offset for core temp */
>> +#define DIMM_INDEX_OFFSET 200 /* sysfs filename start offset for DIMM temp */
>
> Did you test with the "sensors" command to ensure that this works,
> with the large gaps in index values ?
>
> Overall, I am not very happy with the indexing. Since each sensor as
> a label, it might be better to just make it dynamic.
>
Okay, that makes sense. Since all attributes has its own label, indexing
gap wouldn't be needed even in case of CPU architecture change happens.
I'll remove the indexing gap.
>> +#define TEMP_NAME_HEADER_LEN 4 /* sysfs temp type header length */
>> +#define OF_DIMM_NUMS_DEFAULT 16 /* default dimm-nums setting */
>> +
>> +#define CORE_TEMP_ATTRS 5
>> +#define DIMM_TEMP_ATTRS 2
>> +#define ATTR_NAME_LEN 24
>> +
>> +#define UPDATE_INTERVAL_MIN HZ
>> +
>> +enum sign_t {
>> + POS,
>> + NEG
>> +};
>> +
>> +struct cpuinfo_t {
>> + bool valid;
>> + u32 dib;
>> + u8 cpuid;
>> + u8 platform_id;
>> + u32 microcode;
>> + u8 logical_thread_nums;
>> +};
>> +
>> +struct temp_data_t {
>> + bool valid;
>> + s32 value;
>> + unsigned long last_updated;
>> +};
>> +
>> +struct temp_group_t {
>> + struct temp_data_t tjmax;
>> + struct temp_data_t tcontrol;
>> + struct temp_data_t tthrottle;
>> + struct temp_data_t dts_margin;
>> + struct temp_data_t die;
>> + struct temp_data_t core[CORE_NUMS_MAX];
>> + struct temp_data_t dimm[DIMM_NUMS_MAX];
>> +};
>> +
>> +struct core_temp_attr_group_t {
>> + struct sensor_device_attribute sd_attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS];
>> + char attr_name[CORE_NUMS_MAX][CORE_TEMP_ATTRS][ATTR_NAME_LEN];
>> + struct attribute *attrs[CORE_NUMS_MAX][CORE_TEMP_ATTRS + 1];
>> + struct attribute_group attr_group[CORE_NUMS_MAX];
>> +};
>> +
>> +struct dimm_temp_attr_group_t {
>> + struct sensor_device_attribute sd_attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS];
>> + char attr_name[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS][ATTR_NAME_LEN];
>> + struct attribute *attrs[DIMM_NUMS_MAX][DIMM_TEMP_ATTRS + 1];
>> + struct attribute_group attr_group[DIMM_NUMS_MAX];
>> +};
>> +
>> +struct peci_hwmon {
>> + struct device *dev;
>> + struct device *hwmon_dev;
>> + char name[NAME_MAX];
>> + const struct attribute_group **groups;
>> + struct cpuinfo_t cpuinfo;
>> + struct temp_group_t temp;
>> + u32 cpu_id;
>> + bool show_core;
>> + u32 core_nums;
>> + u32 dimm_nums;
>> + atomic_t core_group_created;
>> + struct core_temp_attr_group_t core;
>> + struct dimm_temp_attr_group_t dimm;
>> +};
>> +
>> +enum label_t {
>> + L_DIE,
>> + L_DTS,
>> + L_TCONTROL,
>> + L_TTHROTTLE,
>> + L_MAX
>> +};
>> +
>> +static const char *peci_label[L_MAX] = {
>> + "Die temperature\n",
>> + "DTS thermal margin to Tcontrol\n",
>> + "Tcontrol temperature\n",
>> + "Tthrottle temperature\n",
>
> "temperature" is redundant for a temperature label.
>
Agreed. Will remove the redundant string.
>> +};
>> +
>> +static DEFINE_MUTEX(peci_hwmon_lock);
>> +
>> +static int create_core_temp_group(struct peci_hwmon *priv, int core_no);
>
> Please avoid forward declarations.
>
Will fix it.
>> +
>> +
>
> Please run your patches throuch checkpatch --strict and fix what it reports,
> or provide a reason why you don't.
>
Thanks for the tip. I didn't know about the --strict option before.
Checked that the option reports more check points such as this multiple
blank line case. I will run all patches again using --strict option.
>> +static int xfer_peci_msg(int cmd, void *pmsg)
>> +{
>> + int rc;
>> +
>> + mutex_lock(&peci_hwmon_lock);
>> + rc = peci_ioctl(NULL, cmd, (unsigned long)pmsg);
>> + mutex_unlock(&peci_hwmon_lock);
>> +
>> + return rc;
>> +}
>> +
>> +static int get_cpuinfo(struct peci_hwmon *priv)
>> +{
>> + struct peci_get_dib_msg dib_msg;
>> + struct peci_rd_pkg_cfg_msg cfg_msg;
>> + int rc, i;
>> +
>> + if (!priv->cpuinfo.valid) {
>> + dib_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_GET_DIB, (void *)&dib_msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + priv->cpuinfo.dib = dib_msg.dib;
>> +
>> + cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + cfg_msg.index = MBX_INDEX_CPU_ID;
>> + cfg_msg.param = 0;
>> + cfg_msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + priv->cpuinfo.cpuid = cfg_msg.pkg_config[0];
>> +
>> + cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + cfg_msg.index = MBX_INDEX_CPU_ID;
>> + cfg_msg.param = 1;
>> + cfg_msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + priv->cpuinfo.platform_id = cfg_msg.pkg_config[0];
>> +
>> + cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + cfg_msg.index = MBX_INDEX_CPU_ID;
>> + cfg_msg.param = 3;
>> + cfg_msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + priv->cpuinfo.logical_thread_nums = cfg_msg.pkg_config[0] + 1;
>> +
>> + cfg_msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + cfg_msg.index = MBX_INDEX_CPU_ID;
>> + cfg_msg.param = 4;
>> + cfg_msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&cfg_msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + priv->cpuinfo.microcode = (cfg_msg.pkg_config[3] << 24) |
>> + (cfg_msg.pkg_config[2] << 16) |
>> + (cfg_msg.pkg_config[1] << 8) |
>> + cfg_msg.pkg_config[0];
>> +
>> + priv->core_nums = priv->cpuinfo.logical_thread_nums / 2;
>
> This seems to assume a 1:2 relationship between number of threads and
> number of CPUs, which is incorrect.
>
You are right. This can't cover all CPUs. Will find a proper way.
>> +
>> + if (priv->show_core &&
>> + atomic_inc_return(&priv->core_group_created) == 1) {
>> + for (i = 0; i < priv->core_nums; i++) {
>> + rc = create_core_temp_group(priv, i);
>
> This is messy. Sensor groups should be created before or during
> hwmon registration, not at some arbitrary later time.
>
> I don't know the logic behind this, but if it is supposed to track CPUs
> coming online and going offline it is the wrong approach.
>
Agreed. This driver wouldn't make communication with CPUs at all if CPUs
are powered down so we don't need to leave this driver as inserted. As
you commented below, if a agent does insert/remove this driver module
while tracking CPU power state, this delayed creation logic wouldn't be
needed. I will rewrite it.
>> + if (rc != 0) {
>> + dev_err(priv->dev,
>> + "Failed to create core temp group\n");
>> + for (--i; i >= 0; i--) {
>> + sysfs_remove_group(
>> + &priv->hwmon_dev->kobj,
>> + &priv->core.attr_group[i]);
>> + }
>> + atomic_set(&priv->core_group_created,
>> + 0);
>> + return rc;
>> + }
>> + }
>> + }
>> +
>> + priv->cpuinfo.valid = true;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int get_tjmax(struct peci_hwmon *priv)
>> +{
>> + struct peci_rd_pkg_cfg_msg msg;
>> + int rc;
>> +
>> + rc = get_cpuinfo(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + if (!priv->temp.tjmax.valid) {
>> + msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + msg.index = MBX_INDEX_TEMP_TARGET;
>> + msg.param = 0;
>> + msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + priv->temp.tjmax.value = (s32)msg.pkg_config[2] * 1000;
>> + priv->temp.tjmax.valid = true;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int get_tcontrol(struct peci_hwmon *priv)
>> +{
>> + struct peci_rd_pkg_cfg_msg msg;
>> + s32 tcontrol_margin;
>> + int rc;
>> +
>> + if (priv->temp.tcontrol.valid &&
>> + time_before(jiffies, priv->temp.tcontrol.last_updated +
>> + UPDATE_INTERVAL_MIN))
>> + return 0;
>> +
>
> Is the delay necessary ? Otherwise I would suggest to drop it.
> It adds a lot of complexity to the driver. Also, if the user polls
> values more often, that is presumably on purpose.
>
I was intended to reduce traffic on PECI bus because it's low speed
single wired bus, and temperature values don't change frequently because
the value is sampled and averaged in CPU itself. I'll keep this.
>> + rc = get_tjmax(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + msg.index = MBX_INDEX_TEMP_TARGET;
>> + msg.param = 0;
>> + msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + tcontrol_margin = msg.pkg_config[1];
>> + tcontrol_margin = ((tcontrol_margin ^ 0x80) - 0x80) * 1000;
>> +
>> + priv->temp.tcontrol.value = priv->temp.tjmax.value - tcontrol_margin;
>> +
>> + if (!priv->temp.tcontrol.valid) {
>> + priv->temp.tcontrol.last_updated = INITIAL_JIFFIES;
>> + priv->temp.tcontrol.valid = true;
>> + } else {
>> + priv->temp.tcontrol.last_updated = jiffies;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int get_tthrottle(struct peci_hwmon *priv)
>> +{
>> + struct peci_rd_pkg_cfg_msg msg;
>> + s32 tthrottle_offset;
>> + int rc;
>> +
>> + if (priv->temp.tthrottle.valid &&
>> + time_before(jiffies, priv->temp.tthrottle.last_updated +
>> + UPDATE_INTERVAL_MIN))
>> + return 0;
>> +
>> + rc = get_tjmax(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + msg.index = MBX_INDEX_TEMP_TARGET;
>> + msg.param = 0;
>> + msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + tthrottle_offset = (msg.pkg_config[3] & 0x2f) * 1000;
>> + priv->temp.tthrottle.value = priv->temp.tjmax.value - tthrottle_offset;
>> +
>> + if (!priv->temp.tthrottle.valid) {
>> + priv->temp.tthrottle.last_updated = INITIAL_JIFFIES;
>> + priv->temp.tthrottle.valid = true;
>> + } else {
>> + priv->temp.tthrottle.last_updated = jiffies;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int get_die_temp(struct peci_hwmon *priv)
>> +{
>> + struct peci_get_temp_msg msg;
>> + int rc;
>> +
>> + if (priv->temp.die.valid &&
>> + time_before(jiffies, priv->temp.die.last_updated +
>> + UPDATE_INTERVAL_MIN))
>> + return 0;
>> +
>> + rc = get_tjmax(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_GET_TEMP, (void *)&msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + priv->temp.die.value = priv->temp.tjmax.value +
>> + ((s32)msg.temp_raw * 1000 / 64);
>> +
>> + if (!priv->temp.die.valid) {
>> + priv->temp.die.last_updated = INITIAL_JIFFIES;
>> + priv->temp.die.valid = true;
>> + } else {
>> + priv->temp.die.last_updated = jiffies;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int get_dts_margin(struct peci_hwmon *priv)
>> +{
>> + struct peci_rd_pkg_cfg_msg msg;
>> + s32 dts_margin;
>> + int rc;
>> +
>> + if (priv->temp.dts_margin.valid &&
>> + time_before(jiffies, priv->temp.dts_margin.last_updated +
>> + UPDATE_INTERVAL_MIN))
>> + return 0;
>> +
> Are all those values expected to change dynamically, or are some static ?
> Static values do not have to be re-read repeatedly but can be cached
> permanently.
>
All values expected to change dynamically except the Tjmax value which
is fused in CPU package, but the Tjmax varies on each CPU package so we
need to read the Tjmax at least once. Current implementation uses cached
Tjmax value after the first reading on the value.
>> + rc = get_cpuinfo(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + msg.index = MBX_INDEX_DTS_MARGIN;
>> + msg.param = 0;
>> + msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
>> +
>> + /*
>> + * Processors return a value of DTS reading in 10.6 format
>> + * (10 bits signed decimal, 6 bits fractional).
>> + * Error codes:
>> + * 0x8000: General sensor error
>> + * 0x8001: Reserved
>> + * 0x8002: Underflow on reading value
>> + * 0x8003-0x81ff: Reserved
>> + */
>> + if (dts_margin >= 0x8000 && dts_margin <= 0x81ff)
>> + return -1;
>> +
>> + dts_margin = ((dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
>> +
> The above code is repeated several times. Please consider moving it
> into a function to reduce duplication.
>
Okay. I will move it to a function.
>> + priv->temp.dts_margin.value = dts_margin;
>> +
>> + if (!priv->temp.dts_margin.valid) {
>> + priv->temp.dts_margin.last_updated = INITIAL_JIFFIES;
>> + priv->temp.dts_margin.valid = true;
>> + } else {
>> + priv->temp.dts_margin.last_updated = jiffies;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int get_core_temp(struct peci_hwmon *priv, int core_index)
>> +{
>> + struct peci_rd_pkg_cfg_msg msg;
>> + s32 core_dts_margin;
>> + int rc;
>> +
>> + if (priv->temp.core[core_index].valid &&
>> + time_before(jiffies, priv->temp.core[core_index].last_updated +
>> + UPDATE_INTERVAL_MIN))
>> + return 0;
>> +
>> + rc = get_tjmax(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + msg.index = MBX_INDEX_PER_CORE_DTS_TEMP;
>> + msg.param = core_index;
>> + msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + core_dts_margin = (msg.pkg_config[1] << 8) | msg.pkg_config[0];
>> +
>> + /*
>> + * Processors return a value of the core DTS reading in 10.6 format
>> + * (10 bits signed decimal, 6 bits fractional).
>> + * Error codes:
>> + * 0x8000: General sensor error
>> + * 0x8001: Reserved
>> + * 0x8002: Underflow on reading value
>> + * 0x8003-0x81ff: Reserved
>> + */
>> + if (core_dts_margin >= 0x8000 && core_dts_margin <= 0x81ff)
>> + return -1;
>> +
>> + core_dts_margin = ((core_dts_margin ^ 0x8000) - 0x8000) * 1000 / 64;
>> +
>> + priv->temp.core[core_index].value = priv->temp.tjmax.value +
>> + core_dts_margin;
>> +
>> + if (!priv->temp.core[core_index].valid) {
>> + priv->temp.core[core_index].last_updated = INITIAL_JIFFIES;
>> + priv->temp.core[core_index].valid = true;
>> + } else {
>> + priv->temp.core[core_index].last_updated = jiffies;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int get_dimm_temp(struct peci_hwmon *priv, int dimm_index)
>> +{
>> + struct peci_rd_pkg_cfg_msg msg;
>> + int channel_rank = dimm_index / 2;
>> + int dimm_order = dimm_index % 2;
>> + int rc;
>> +
>> + if (priv->temp.core[dimm_index].valid &&
>> + time_before(jiffies, priv->temp.core[dimm_index].last_updated +
>> + UPDATE_INTERVAL_MIN))
>> + return 0;
>> +
>> + rc = get_cpuinfo(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + msg.target = PECI_BASE_ADDR + priv->cpu_id;
>> + msg.index = MBX_INDEX_DDR_DIMM_TEMP;
>> + msg.param = channel_rank;
>> + msg.rx_len = 4;
>> +
>> + rc = xfer_peci_msg(PECI_IOC_RD_PKG_CFG, (void *)&msg);
>> + if (rc < 0)
>> + return rc;
>> +
>> + priv->temp.dimm[dimm_index].value = msg.pkg_config[dimm_order] * 1000;
>> +
>> + if (!priv->temp.dimm[dimm_index].valid) {
>> + priv->temp.dimm[dimm_index].last_updated = INITIAL_JIFFIES;
>> + priv->temp.dimm[dimm_index].valid = true;
>> + } else {
>> + priv->temp.dimm[dimm_index].last_updated = jiffies;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static ssize_t show_info(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(dev);
>> + int rc;
>> +
>> + rc = get_cpuinfo(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + return sprintf(buf, "dib : 0x%08x\n"
>> + "cpuid : 0x%x\n"
>> + "platform id : %d\n"
>> + "stepping : %d\n"
>> + "microcode : 0x%08x\n"
>> + "logical thread nums : %d\n",
>> + priv->cpuinfo.dib,
>> + priv->cpuinfo.cpuid,
>> + priv->cpuinfo.platform_id,
>> + priv->cpuinfo.cpuid & 0xf,
>> + priv->cpuinfo.microcode,
>> + priv->cpuinfo.logical_thread_nums);
>> +}
>
> Please no non-standard attributes, much less attributes not following sysfs
> attribute rules. If you want to display such information, consider using
> debugfs. Besides, this information specifically appears to duplicate
> the content of /proc/cpuid, which doesn't really add any value at all.
>
Got it. Will drop this non-standard attribute.
>> +
>> +static ssize_t show_tcontrol(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(dev);
>> + int rc;
>> +
>> + rc = get_tcontrol(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + return sprintf(buf, "%d\n", priv->temp.tcontrol.value);
>> +}
>> +
>> +static ssize_t show_tcontrol_margin(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(dev);
>> + struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> + int rc;
>> +
>> + rc = get_tcontrol(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + return sprintf(buf, "%d\n", sensor_attr->index == POS ?
>> + priv->temp.tjmax.value -
>> + priv->temp.tcontrol.value :
>> + priv->temp.tcontrol.value -
>> + priv->temp.tjmax.value);
>> +}
>> +
>> +static ssize_t show_tthrottle(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(dev);
>> + int rc;
>> +
>> + rc = get_tthrottle(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + return sprintf(buf, "%d\n", priv->temp.tthrottle.value);
>> +}
>> +
>> +static ssize_t show_tjmax(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(dev);
>> + int rc;
>> +
>> + rc = get_tjmax(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + return sprintf(buf, "%d\n", priv->temp.tjmax.value);
>> +}
>> +
>> +static ssize_t show_die_temp(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(dev);
>> + int rc;
>> +
>> + rc = get_die_temp(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + return sprintf(buf, "%d\n", priv->temp.die.value);
>> +}
>> +
>> +static ssize_t show_dts_therm_margin(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(dev);
>> + int rc;
>> +
>> + rc = get_dts_margin(priv);
>> + if (rc < 0)
>> + return rc;
>> +
>> + return sprintf(buf, "%d\n", priv->temp.dts_margin.value);
>> +}
>> +
>> +static ssize_t show_core_temp(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(dev);
>> + struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> + int core_index = sensor_attr->index;
>> + int rc;
>> +
>> + rc = get_core_temp(priv, core_index);
>> + if (rc < 0)
>> + return rc;
>> +
>> + return sprintf(buf, "%d\n", priv->temp.core[core_index].value);
>> +}
>> +
>> +static ssize_t show_dimm_temp(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(dev);
>> + struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> + int dimm_index = sensor_attr->index;
>> + int rc;
>> +
>> + rc = get_dimm_temp(priv, dimm_index);
>> + if (rc < 0)
>> + return rc;
>> +
>> + return sprintf(buf, "%d\n", priv->temp.dimm[dimm_index].value);
>> +}
>> +
>> +static ssize_t show_value(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +
>> + return sprintf(buf, "%d\n", sensor_attr->index);
>> +}
>> +
>> +static ssize_t show_label(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +
>> + return sprintf(buf, peci_label[sensor_attr->index]);
>> +}
>> +
>> +static ssize_t show_core_label(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +
>> + return sprintf(buf, "Core #%d temperature\n", sensor_attr->index);
>> +}
>
> Your label strings are quite long. How does that look like with the
> sensors command ?
>
> Plus, again, "temperature" in a temperature label is redundant.
>
Agreed. Will remove the redundant string.
>> +
>> +static ssize_t show_dimm_label(struct device *dev,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + struct sensor_device_attribute *sensor_attr = to_sensor_dev_attr(attr);
>> +
>> + char channel = 'A' + (sensor_attr->index / 2);
>> + int index = sensor_attr->index % 2;
>> +
>> + return sprintf(buf, "Channel Rank %c DDR DIMM #%d temperature\n",
>> + channel, index);
>> +}
>> +
>> +/* Die temperature */
>> +static SENSOR_DEVICE_ATTR(temp1_label, 0444, show_label, NULL, L_DIE);
>> +static SENSOR_DEVICE_ATTR(temp1_input, 0444, show_die_temp, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp1_max, 0444, show_tcontrol, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp1_crit, 0444, show_tjmax, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp1_crit_hyst, 0444, show_tcontrol_margin, NULL,
>> + POS);
>> +
>> +static struct attribute *die_temp_attrs[] = {
>> + &sensor_dev_attr_temp1_label.dev_attr.attr,
>> + &sensor_dev_attr_temp1_input.dev_attr.attr,
>> + &sensor_dev_attr_temp1_max.dev_attr.attr,
>> + &sensor_dev_attr_temp1_crit.dev_attr.attr,
>> + &sensor_dev_attr_temp1_crit_hyst.dev_attr.attr,
>> + NULL
>> +};
>> +
>> +static const struct attribute_group die_temp_attr_group = {
>> + .attrs = die_temp_attrs,
>> +};
>> +
>> +/* DTS thermal margin temperature */
>> +static SENSOR_DEVICE_ATTR(temp2_label, 0444, show_label, NULL, L_DTS);
>> +static SENSOR_DEVICE_ATTR(temp2_input, 0444, show_dts_therm_margin, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp2_min, 0444, show_value, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp2_lcrit, 0444, show_tcontrol_margin, NULL, NEG);
>> +
>> +static struct attribute *dts_margin_temp_attrs[] = {
>> + &sensor_dev_attr_temp2_label.dev_attr.attr,
>> + &sensor_dev_attr_temp2_input.dev_attr.attr,
>> + &sensor_dev_attr_temp2_min.dev_attr.attr,
>> + &sensor_dev_attr_temp2_lcrit.dev_attr.attr,
>> + NULL
>> +};
>> +
>> +static const struct attribute_group dts_margin_temp_attr_group = {
>> + .attrs = dts_margin_temp_attrs,
>> +};
>> +
>> +/* Tcontrol temperature */
>> +static SENSOR_DEVICE_ATTR(temp3_label, 0444, show_label, NULL, L_TCONTROL);
>> +static SENSOR_DEVICE_ATTR(temp3_input, 0444, show_tcontrol, NULL, 0);
>> +static SENSOR_DEVICE_ATTR(temp3_crit, 0444, show_tjmax, NULL, 0);
>> +
>> +static struct attribute *tcontrol_temp_attrs[] = {
>> + &sensor_dev_attr_temp3_label.dev_attr.attr,
>> + &sensor_dev_attr_temp3_input.dev_attr.attr,
>> + &sensor_dev_attr_temp3_crit.dev_attr.attr,
>> + NULL
>> +};
>> +
>> +static const struct attribute_group tcontrol_temp_attr_group = {
>> + .attrs = tcontrol_temp_attrs,
>> +};
>> +
>> +/* Tthrottle temperature */
>> +static SENSOR_DEVICE_ATTR(temp4_label, 0444, show_label, NULL, L_TTHROTTLE);
>> +static SENSOR_DEVICE_ATTR(temp4_input, 0444, show_tthrottle, NULL, 0);
>> +
>> +static struct attribute *tthrottle_temp_attrs[] = {
>> + &sensor_dev_attr_temp4_label.dev_attr.attr,
>> + &sensor_dev_attr_temp4_input.dev_attr.attr,
>> + NULL
>> +};
>> +
>> +static const struct attribute_group tthrottle_temp_attr_group = {
>> + .attrs = tthrottle_temp_attrs,
>> +};
>> +
>> +/* CPU info */
>> +static SENSOR_DEVICE_ATTR(info, 0444, show_info, NULL, 0);
>> +
>> +static struct attribute *info_attrs[] = {
>> + &sensor_dev_attr_info.dev_attr.attr,
>> + NULL
>> +};
>> +
>> +static const struct attribute_group info_attr_group = {
>> + .attrs = info_attrs,
>> +};
>> +
>> +const struct attribute_group *peci_hwmon_attr_groups[] = {
>> + &info_attr_group,
>> + &die_temp_attr_group,
>> + &dts_margin_temp_attr_group,
>> + &tcontrol_temp_attr_group,
>> + &tthrottle_temp_attr_group,
>> + NULL
>> +};
>> +
>> +static ssize_t (*const core_show_fn[CORE_TEMP_ATTRS]) (struct device *dev,
>> + struct device_attribute *devattr, char *buf) = {
>> + show_core_label,
>> + show_core_temp,
>> + show_tcontrol,
>> + show_tjmax,
>> + show_tcontrol_margin,
>> +};
>> +
>> +static const char *const core_suffix[CORE_TEMP_ATTRS] = {
>> + "label",
>> + "input",
>> + "max",
>> + "crit",
>> + "crit_hyst",
>> +};
>> +
>> +static int create_core_temp_group(struct peci_hwmon *priv, int core_no)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < CORE_TEMP_ATTRS; i++) {
>> + snprintf(priv->core.attr_name[core_no][i],
>> + ATTR_NAME_LEN, "temp%d_%s",
>> + CORE_INDEX_OFFSET + core_no, core_suffix[i]);
>> + sysfs_attr_init(
>> + &priv->core.sd_attrs[core_no][i].dev_attr.attr);
>> + priv->core.sd_attrs[core_no][i].dev_attr.attr.name =
>> + priv->core.attr_name[core_no][i];
>> + priv->core.sd_attrs[core_no][i].dev_attr.attr.mode = 0444;
>> + priv->core.sd_attrs[core_no][i].dev_attr.show = core_show_fn[i];
>> + if (i == 0 || i == 1) /* label or temp */
>> + priv->core.sd_attrs[core_no][i].index = core_no;
>> + priv->core.attrs[core_no][i] =
>> + &priv->core.sd_attrs[core_no][i].dev_attr.attr;
>> + }
>> +
>> + priv->core.attr_group[core_no].attrs = priv->core.attrs[core_no];
>> +
>> + return sysfs_create_group(&priv->hwmon_dev->kobj,
>> + &priv->core.attr_group[core_no]);
>> +}
>> +
>> +static ssize_t (*const dimm_show_fn[DIMM_TEMP_ATTRS]) (struct device *dev,
>> + struct device_attribute *devattr, char *buf) = {
>> + show_dimm_label,
>> + show_dimm_temp,
>> +};
>> +
>> +static const char *const dimm_suffix[DIMM_TEMP_ATTRS] = {
>> + "label",
>> + "input",
>> +};
>> +
>> +static int create_dimm_temp_group(struct peci_hwmon *priv, int dimm_no)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < DIMM_TEMP_ATTRS; i++) {
>> + snprintf(priv->dimm.attr_name[dimm_no][i],
>> + ATTR_NAME_LEN, "temp%d_%s",
>> + DIMM_INDEX_OFFSET + dimm_no, dimm_suffix[i]);
>> + sysfs_attr_init(&priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr);
>> + priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.name =
>> + priv->dimm.attr_name[dimm_no][i];
>> + priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr.mode = 0444;
>> + priv->dimm.sd_attrs[dimm_no][i].dev_attr.show = dimm_show_fn[i];
>> + priv->dimm.sd_attrs[dimm_no][i].index = dimm_no;
>> + priv->dimm.attrs[dimm_no][i] =
>> + &priv->dimm.sd_attrs[dimm_no][i].dev_attr.attr;
>> + }
>> +
>> + priv->dimm.attr_group[dimm_no].attrs = priv->dimm.attrs[dimm_no];
>> +
>> + return sysfs_create_group(&priv->hwmon_dev->kobj,
>> + &priv->dimm.attr_group[dimm_no]);
>> +}
>> +
>> +static int peci_hwmon_probe(struct platform_device *pdev)
>> +{
>> + struct device *dev = &pdev->dev;
>> + struct device_node *np = dev->of_node;
>> + struct peci_hwmon *priv;
>> + struct device *hwmon;
>> + int rc, i;
>> +
>> + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
>> + if (!priv)
>> + return -ENOMEM;
>> +
>> + dev_set_drvdata(dev, priv);
>> + priv->dev = dev;
>> +
>> + rc = of_property_read_u32(np, "cpu-id", &priv->cpu_id);
>
> What entity determines cpu-id ?
>
CPU ID numbering is determined by hardware SOCKET_ID strap pins. In this
driver implementation, cpu-id is being used as CPU client indexing.
>> + if (rc || priv->cpu_id >= CPU_ID_MAX) {
>> + dev_err(dev, "Invalid cpu-id configuration\n");
>> + return rc;
>> + }
>> +
>> + rc = of_property_read_u32(np, "dimm-nums", &priv->dimm_nums);
>
> This is an odd devicetree attribute. Normally the number of DIMMs
> is dynamic. Isn't there a means to get all that information dynamically
> instead of having to set it through devicetree ? What if someone adds
> or removes a DIMM ? Who updates the devicetree ?
>
It means the number of DIMM slots each CPU has, doesn't mean the number
of currently installed DIMM components. If a DIMM is inserted a slot,
CPU reports its actual temperature but on empty slot, CPU reports 0
instead of reporting an error so it is the reason why this driver
enumerates all DIMM slots' attribute.
>> + if (rc || priv->dimm_nums > DIMM_NUMS_MAX) {
>> + dev_warn(dev, "Invalid dimm-nums : %u. Use default : %u\n",
>> + priv->dimm_nums, OF_DIMM_NUMS_DEFAULT);
>> + priv->dimm_nums = OF_DIMM_NUMS_DEFAULT;
>> + }
>> +
>> + priv->show_core = of_property_read_bool(np, "show-core");
>
> This does not look like an appropriate devicetree attribute.
>
Okay. I will remove this devicetree attribute and make this driver
enable core temperature attributes always.
>> +
>> + priv->groups = peci_hwmon_attr_groups;
>> +
>
> This assignment (and the ->groups variable) is quite pointless.
>
Will fix it.
>> + snprintf(priv->name, NAME_MAX, HWMON_NAME ".cpu%d", priv->cpu_id);
>> +
>> + hwmon = devm_hwmon_device_register_with_groups(dev,
>> + priv->name,
>> + priv, priv->groups);
>
> Please rewrite the driver to use devm_hwmon_device_register_with_info(),
> and avoid dynamic attributes.
>
I will rewrite it.
>> +
>> + rc = PTR_ERR_OR_ZERO(hwmon);
>> + if (rc != 0) {
>> + dev_err(dev, "Failed to register peci hwmon\n");
>> + return rc;
>> + }
>> +
>> + priv->hwmon_dev = hwmon;
>
> Something is logically wrong if you need to store hwmon_dev in the
> private data structure. Specifically, creating attributes dynamically
> after hwmon registration is wrong.
>
>> +
>> + for (i = 0; i < priv->dimm_nums; i++) {
>> + rc = create_dimm_temp_group(priv, i);
>
> No. See earlier comments. All attribute groups must be created during
> registration (or before, but I am not inclined to accept a new driver
> doing that).
>
>> + if (rc != 0) {
>> + dev_err(dev, "Failed to create dimm temp group\n");
>> + for (--i; i >= 0; i--) {
>> + sysfs_remove_group(&priv->hwmon_dev->kobj,
>> + &priv->dimm.attr_group[i]);
>> + }
>> + return rc;
>> + }
>> + }
>> +
>> + /*
>> + * Try to create core temp group now. It will be created if CPU is
>> + * curretnly online or it will be created after the first reading of
>> + * cpuinfo from the online CPU otherwise.
>
> This is not how CPUs are supposed to be detected, and it does not handle CPUs
> taken offline. If the driver is instantiated as a CPU comes online, or as it
> goes offline, the driver should use the appropriate kernel interfaces to
> trigger that instantiation or removal. However, if so, it may be inappropriate
> to associate CPU temperatures with other system temperatures in the same
> instance of the driver; after all, those are all independent of each other.
>
> Overall, I suspect that there should be a callback or some other mechanism
> in the peci core to trigger instantiation and removal of this driver, and
> I am not sure if any of the devicetree properties makes sense at all.
>
> For example, if an instance of this driver is associated with a PECI
> agent (with assorted CPU/DIMM temperature reporting), the instantiation
> could be triggered as soon as the PECI core detects that the agent is
> available, and the PECI core could report what exactly that instance
> supports.
>
Thanks for sharing your detailed comment. In fact, PECI doesn't have any
mechanism to get a callback for checking whether CPU is online or not.
The only way PECI can do is, polling a CPU using the Ping PECI command.
Also, a BMC controller can't make any PECI communication with offline
CPU so this implementation uses delayed creation for some attributes
which is messy as you said.
Like you suggested, a PECI agent could check CPU power state using some
other mechanism and this driver module could be dynamically
inserted/removed by the agent according to the CPU power state. This way
could make all attributes creation possible at probing time so no need
to use the delayed creation.
Also, I'll check feasibility of dynamic checking for maximum supportable
numbers on each component type so that we can dynamically set the values
as you suggested above.
>> + */
>> + if (priv->show_core)
>> + (void) get_cpuinfo(priv);
>> +
>> + dev_info(dev, "peci hwmon for CPU#%d registered\n", priv->cpu_id);
>
> Is this logging noise necessary ? Besides, some of it is redundant.
>
No, it isn't necessary. I will remove it.
>> +
>> + return rc;
>> +}
>> +
>> +static int peci_hwmon_remove(struct platform_device *pdev)
>> +{
>> + struct peci_hwmon *priv = dev_get_drvdata(&pdev->dev);
>> + int i;
>> +
>> + if (atomic_read(&priv->core_group_created))
>> + for (i = 0; i < priv->core_nums; i++) {
>> + sysfs_remove_group(&priv->hwmon_dev->kobj,
>> + &priv->core.attr_group[i]);
>> + }
>> +
>> + for (i = 0; i < priv->dimm_nums; i++) {
>> + sysfs_remove_group(&priv->hwmon_dev->kobj,
>> + &priv->dimm.attr_group[i]);
>> + }
>
> If you need to call sysfs_remove_group from here,
> something is conceptually wrong in your driver.
>
I'll remove it while rewriting the driver.
>> +
>> + return 0;
>> +}
>> +
>> +static const struct of_device_id peci_of_table[] = {
>> + { .compatible = "peci-hwmon", },
>
> This does not look like a reference to some piece of hardware.
>
This driver provides generic PECI hwmon function to which controller has
PECI HW such as Aspeed or Nuvoton BMC chip so it's not dependant on a
specific hardware. Should I remove this or any suggestion?
>> + { }
>> +};
>> +MODULE_DEVICE_TABLE(of, peci_of_table);
>> +
>> +static struct platform_driver peci_hwmon_driver = {
>> + .probe = peci_hwmon_probe,
>> + .remove = peci_hwmon_remove,
>> + .driver = {
>> + .name = DEVICE_NAME,
>> + .of_match_table = peci_of_table,
>> + },
>> +};
>> +
>> +module_platform_driver(peci_hwmon_driver);
>> +MODULE_AUTHOR("Jae Hyun Yoo <jae.hyun.yoo at linux.intel.com>");
>> +MODULE_DESCRIPTION("PECI hwmon driver");
>> +MODULE_LICENSE("GPL v2");
More information about the openbmc
mailing list