[PATCH 12/14] hwmon: peci: Add dimmtemp driver

Winiarska, Iwona iwona.winiarska at intel.com
Sat Jul 31 08:48:07 AEST 2021


On Mon, 2021-07-26 at 22:08 +0000, Zev Weiss wrote:
> On Mon, Jul 12, 2021 at 05:04:45PM CDT, Iwona Winiarska wrote:
> > Add peci-dimmtemp driver for Digital Thermal Sensor (DTS) thermal
> > readings of DIMMs that are accessible via the processor PECI interface.
> > 
> > The main use case for the driver (and PECI interface) is out-of-band
> > management, where we're able to obtain the DTS readings from an external
> > entity connected with PECI, e.g. BMC on server platforms.
> > 
> > Co-developed-by: Jae Hyun Yoo <jae.hyun.yoo at linux.intel.com>
> > Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo at linux.intel.com>
> > Signed-off-by: Iwona Winiarska <iwona.winiarska at intel.com>
> > Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart at linux.intel.com>
> > ---
> > drivers/hwmon/peci/Kconfig    |  13 +
> > drivers/hwmon/peci/Makefile   |   2 +
> > drivers/hwmon/peci/dimmtemp.c | 508 ++++++++++++++++++++++++++++++++++
> > 3 files changed, 523 insertions(+)
> > create mode 100644 drivers/hwmon/peci/dimmtemp.c
> > 
> > diff --git a/drivers/hwmon/peci/Kconfig b/drivers/hwmon/peci/Kconfig
> > index e10eed68d70a..f2d57efa508b 100644
> > --- a/drivers/hwmon/peci/Kconfig
> > +++ b/drivers/hwmon/peci/Kconfig
> > @@ -14,5 +14,18 @@ config SENSORS_PECI_CPUTEMP
> >           This driver can also be built as a module. If so, the module
> >           will be called peci-cputemp.
> > 
> > +config SENSORS_PECI_DIMMTEMP
> > +       tristate "PECI DIMM temperature monitoring client"
> > +       depends on PECI
> > +       select SENSORS_PECI
> > +       select PECI_CPU
> > +       help
> > +         If you say yes here you get support for the generic Intel PECI hwmon
> > +         driver which provides Digital Thermal Sensor (DTS) thermal readings
> > of
> > +         DIMM components that are accessible via the processor PECI
> > interface.
> > +
> > +         This driver can also be built as a module. If so, the module
> > +         will be called peci-dimmtemp.
> > +
> > config SENSORS_PECI
> >         tristate
> > diff --git a/drivers/hwmon/peci/Makefile b/drivers/hwmon/peci/Makefile
> > index e8a0ada5ab1f..191cfa0227f3 100644
> > --- a/drivers/hwmon/peci/Makefile
> > +++ b/drivers/hwmon/peci/Makefile
> > @@ -1,5 +1,7 @@
> > # SPDX-License-Identifier: GPL-2.0-only
> > 
> > peci-cputemp-y := cputemp.o
> > +peci-dimmtemp-y := dimmtemp.o
> > 
> > obj-$(CONFIG_SENSORS_PECI_CPUTEMP)      += peci-cputemp.o
> > +obj-$(CONFIG_SENSORS_PECI_DIMMTEMP)    += peci-dimmtemp.o
> > diff --git a/drivers/hwmon/peci/dimmtemp.c b/drivers/hwmon/peci/dimmtemp.c
> > new file mode 100644
> > index 000000000000..2fcb8607137a
> > --- /dev/null
> > +++ b/drivers/hwmon/peci/dimmtemp.c
> > @@ -0,0 +1,508 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +// Copyright (c) 2018-2021 Intel Corporation
> > +
> > +#include <linux/auxiliary_bus.h>
> > +#include <linux/bitfield.h>
> > +#include <linux/bitops.h>
> > +#include <linux/hwmon.h>
> > +#include <linux/jiffies.h>
> > +#include <linux/module.h>
> > +#include <linux/peci.h>
> > +#include <linux/peci-cpu.h>
> > +#include <linux/units.h>
> > +#include <linux/workqueue.h>
> > +#include <linux/x86/intel-family.h>
> > +
> > +#include "common.h"
> > +
> > +#define DIMM_MASK_CHECK_DELAY_JIFFIES  msecs_to_jiffies(5000)
> > +#define DIMM_MASK_CHECK_RETRY_MAX      60 /* 60 x 5 secs = 5 minutes */
> > +
> > +/* Max number of channel ranks and DIMM index per channel */
> > +#define CHAN_RANK_MAX_ON_HSX   8
> > +#define DIMM_IDX_MAX_ON_HSX    3
> > +#define CHAN_RANK_MAX_ON_BDX   4
> > +#define DIMM_IDX_MAX_ON_BDX    3
> > +#define CHAN_RANK_MAX_ON_BDXD  2
> > +#define DIMM_IDX_MAX_ON_BDXD   2
> > +#define CHAN_RANK_MAX_ON_SKX   6
> > +#define DIMM_IDX_MAX_ON_SKX    2
> > +#define CHAN_RANK_MAX_ON_ICX   8
> > +#define DIMM_IDX_MAX_ON_ICX    2
> > +#define CHAN_RANK_MAX_ON_ICXD  4
> > +#define DIMM_IDX_MAX_ON_ICXD   2
> > +
> > +#define CHAN_RANK_MAX          CHAN_RANK_MAX_ON_HSX
> > +#define DIMM_IDX_MAX           DIMM_IDX_MAX_ON_HSX
> > +#define DIMM_NUMS_MAX          (CHAN_RANK_MAX * DIMM_IDX_MAX)
> 
> Should we perhaps have a static_assert(DIMM_NUMS_MAX <= 64) so that
> check_populated_dimms() doesn't silently break if we ever have a system
> with > 64 dimms?  (Not sure how far off that might be, but doesn't seem
> *that* wildly inconceivable, anyway.)

Ack, I'll add an assert and warn in check_populated_dimms():

BUILD_BUG_ON(DIMM_NUMS_MAX > 64);
if (chan_rank_max * dimm_idx_max > DIMM_NUMS_MAX) {
        WARN_ONCE(1, "Unsupported number of DIMMs");
        return -EINVAL;
}

> 
> On a similar note, it'd be nice if there were some neat way of
> automating the maintenance of CHAN_RANK_MAX and DIMM_IDX_MAX, but I
> don't know of any great solutions for that offhand.  (Shrug.)

With the added WARN it should be easy enough to be catch it without being an
issue.

> 
> > +
> > +#define CPU_SEG_MASK           GENMASK(23, 16)
> > +#define GET_CPU_SEG(x)         (((x) & CPU_SEG_MASK) >> 16)
> > +#define CPU_BUS_MASK           GENMASK(7, 0)
> > +#define GET_CPU_BUS(x)         ((x) & CPU_BUS_MASK)
> > +
> > +#define DIMM_TEMP_MAX          GENMASK(15, 8)
> > +#define DIMM_TEMP_CRIT         GENMASK(23, 16)
> > +#define GET_TEMP_MAX(x)                (((x) & DIMM_TEMP_MAX) >> 8)
> > +#define GET_TEMP_CRIT(x)       (((x) & DIMM_TEMP_CRIT) >> 16)
> > +
> > +struct dimm_info {
> > +       int chan_rank_max;
> > +       int dimm_idx_max;
> > +       u8 min_peci_revision;
> 
> This field doesn't seem to be used for anything that I can see; is it
> really needed?

Just like in cputemp - for device sanity check.

> 
> > +};
> > +
> > +struct peci_dimmtemp {
> > +       struct peci_device *peci_dev;
> > +       struct device *dev;
> > +       const char *name;
> > +       const struct dimm_info *gen_info;
> > +       struct delayed_work detect_work;
> > +       struct peci_sensor_data temp[DIMM_NUMS_MAX];
> > +       long temp_max[DIMM_NUMS_MAX];
> > +       long temp_crit[DIMM_NUMS_MAX];
> > +       int retry_count;
> > +       char **dimmtemp_label;
> > +       DECLARE_BITMAP(dimm_mask, DIMM_NUMS_MAX);
> > +};
> > +
> > +static u8 __dimm_temp(u32 reg, int dimm_order)
> > +{
> > +       return (reg >> (dimm_order * 8)) & 0xff;
> > +}
> > +
> > +static int get_dimm_temp(struct peci_dimmtemp *priv, int dimm_no)
> > +{
> > +       int dimm_order = dimm_no % priv->gen_info->dimm_idx_max;
> > +       int chan_rank = dimm_no / priv->gen_info->dimm_idx_max;
> > +       struct peci_device *peci_dev = priv->peci_dev;
> > +       u8 cpu_seg, cpu_bus, dev, func;
> > +       u64 offset;
> > +       u32 data;
> > +       u16 reg;
> > +       int ret;
> > +
> > +       if (!peci_sensor_need_update(&priv->temp[dimm_no]))
> > +               return 0;
> > +
> > +       ret = peci_pcs_read(peci_dev, PECI_PCS_DDR_DIMM_TEMP, chan_rank,
> > &data);
> > +       if (ret)
> > +               return ret;
> > +
> > +       priv->temp[dimm_no].value = __dimm_temp(data, dimm_order) *
> > MILLIDEGREE_PER_DEGREE;
> > +
> > +       switch (peci_dev->info.model) {
> > +       case INTEL_FAM6_ICELAKE_X:
> > +       case INTEL_FAM6_ICELAKE_D:
> > +               ret = peci_ep_pci_local_read(peci_dev, 0, 13, 0, 2, 0xd4,
> > &data);
> > +               if (ret || !(data & BIT(31)))
> > +                       break; /* Use default or previous value */
> > +
> > +               ret = peci_ep_pci_local_read(peci_dev, 0, 13, 0, 2, 0xd0,
> > &data);
> > +               if (ret)
> > +                       break; /* Use default or previous value */
> > +
> > +               cpu_seg = GET_CPU_SEG(data);
> > +               cpu_bus = GET_CPU_BUS(data);
> > +
> > +               /*
> > +                * Device 26, Offset 224e0: IMC 0 channel 0 -> rank 0
> > +                * Device 26, Offset 264e0: IMC 0 channel 1 -> rank 1
> > +                * Device 27, Offset 224e0: IMC 1 channel 0 -> rank 2
> > +                * Device 27, Offset 264e0: IMC 1 channel 1 -> rank 3
> > +                * Device 28, Offset 224e0: IMC 2 channel 0 -> rank 4
> > +                * Device 28, Offset 264e0: IMC 2 channel 1 -> rank 5
> > +                * Device 29, Offset 224e0: IMC 3 channel 0 -> rank 6
> > +                * Device 29, Offset 264e0: IMC 3 channel 1 -> rank 7
> > +                */
> > +               dev = 0x1a + chan_rank / 2;
> > +               offset = 0x224e0 + dimm_order * 4;
> > +               if (chan_rank % 2)
> > +                       offset += 0x4000;
> > +
> > +               ret = peci_mmio_read(peci_dev, 0, cpu_seg, cpu_bus, dev, 0,
> > offset, &data);
> > +               if (ret)
> > +                       return ret;
> > +
> > +               priv->temp_max[dimm_no] = GET_TEMP_MAX(data) *
> > MILLIDEGREE_PER_DEGREE;
> > +               priv->temp_crit[dimm_no] = GET_TEMP_CRIT(data) *
> > MILLIDEGREE_PER_DEGREE;
> 
> These two lines look identical in all (non-default) cases; should we
> deduplicate them by just moving them to after the switch?

I'll refactor this.

> 
> > +
> > +               break;
> > +       case INTEL_FAM6_SKYLAKE_X:
> > +               /*
> > +                * Device 10, Function 2: IMC 0 channel 0 -> rank 0
> > +                * Device 10, Function 6: IMC 0 channel 1 -> rank 1
> > +                * Device 11, Function 2: IMC 0 channel 2 -> rank 2
> > +                * Device 12, Function 2: IMC 1 channel 0 -> rank 3
> > +                * Device 12, Function 6: IMC 1 channel 1 -> rank 4
> > +                * Device 13, Function 2: IMC 1 channel 2 -> rank 5
> > +                */
> > +               dev = 10 + chan_rank / 3 * 2 + (chan_rank % 3 == 2 ? 1 : 0);
> > +               func = chan_rank % 3 == 1 ? 6 : 2;
> > +               reg = 0x120 + dimm_order * 4;
> > +
> > +               ret = peci_pci_local_read(peci_dev, 2, dev, func, reg,
> > &data);
> > +               if (ret)
> > +                       return ret;
> > +
> > +               priv->temp_max[dimm_no] = GET_TEMP_MAX(data) *
> > MILLIDEGREE_PER_DEGREE;
> > +               priv->temp_crit[dimm_no] = GET_TEMP_CRIT(data) *
> > MILLIDEGREE_PER_DEGREE;
> > +
> > +               break;
> > +       case INTEL_FAM6_BROADWELL_D:
> > +               /*
> > +                * Device 10, Function 2: IMC 0 channel 0 -> rank 0
> > +                * Device 10, Function 6: IMC 0 channel 1 -> rank 1
> > +                * Device 12, Function 2: IMC 1 channel 0 -> rank 2
> > +                * Device 12, Function 6: IMC 1 channel 1 -> rank 3
> > +                */
> > +               dev = 10 + chan_rank / 2 * 2;
> > +               func = (chan_rank % 2) ? 6 : 2;
> > +               reg = 0x120 + dimm_order * 4;
> > +
> > +               ret = peci_pci_local_read(peci_dev, 2, dev, func, reg,
> > &data);
> > +               if (ret)
> > +                       return ret;
> > +
> > +               priv->temp_max[dimm_no] = GET_TEMP_MAX(data) *
> > MILLIDEGREE_PER_DEGREE;
> > +               priv->temp_crit[dimm_no] = GET_TEMP_CRIT(data) *
> > MILLIDEGREE_PER_DEGREE;
> > +
> > +               break;
> > +       case INTEL_FAM6_HASWELL_X:
> > +       case INTEL_FAM6_BROADWELL_X:
> > +               /*
> > +                * Device 20, Function 0: IMC 0 channel 0 -> rank 0
> > +                * Device 20, Function 1: IMC 0 channel 1 -> rank 1
> > +                * Device 21, Function 0: IMC 0 channel 2 -> rank 2
> > +                * Device 21, Function 1: IMC 0 channel 3 -> rank 3
> > +                * Device 23, Function 0: IMC 1 channel 0 -> rank 4
> > +                * Device 23, Function 1: IMC 1 channel 1 -> rank 5
> > +                * Device 24, Function 0: IMC 1 channel 2 -> rank 6
> > +                * Device 24, Function 1: IMC 1 channel 3 -> rank 7
> > +                */
> > +               dev = 20 + chan_rank / 2 + chan_rank / 4;
> > +               func = chan_rank % 2;
> > +               reg = 0x120 + dimm_order * 4;
> > +
> > +               ret = peci_pci_local_read(peci_dev, 1, dev, func, reg,
> > &data);
> > +               if (ret)
> > +                       return ret;
> > +
> > +               priv->temp_max[dimm_no] = GET_TEMP_MAX(data) *
> > MILLIDEGREE_PER_DEGREE;
> > +               priv->temp_crit[dimm_no] = GET_TEMP_CRIT(data) *
> > MILLIDEGREE_PER_DEGREE;
> > +
> > +               break;
> > +       default:
> > +               return -EOPNOTSUPP;
> > +       }
> > +
> > +       peci_sensor_mark_updated(&priv->temp[dimm_no]);
> > +
> > +       return 0;
> > +}
> > +
> > +static int dimmtemp_read_string(struct device *dev,
> > +                               enum hwmon_sensor_types type,
> > +                               u32 attr, int channel, const char **str)
> > +{
> > +       struct peci_dimmtemp *priv = dev_get_drvdata(dev);
> > +
> > +       if (attr != hwmon_temp_label)
> > +               return -EOPNOTSUPP;
> > +
> > +       *str = (const char *)priv->dimmtemp_label[channel];
> > +
> > +       return 0;
> > +}
> > +
> > +static int dimmtemp_read(struct device *dev, enum hwmon_sensor_types type,
> > +                        u32 attr, int channel, long *val)
> > +{
> > +       struct peci_dimmtemp *priv = dev_get_drvdata(dev);
> > +       int ret;
> > +
> > +       ret = get_dimm_temp(priv, channel);
> > +       if (ret)
> > +               return ret;
> > +
> > +       switch (attr) {
> > +       case hwmon_temp_input:
> > +               *val = priv->temp[channel].value;
> > +               break;
> > +       case hwmon_temp_max:
> > +               *val = priv->temp_max[channel];
> > +               break;
> > +       case hwmon_temp_crit:
> > +               *val = priv->temp_crit[channel];
> > +               break;
> > +       default:
> > +               return -EOPNOTSUPP;
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static umode_t dimmtemp_is_visible(const void *data, enum
> > hwmon_sensor_types type,
> > +                                  u32 attr, int channel)
> > +{
> > +       const struct peci_dimmtemp *priv = data;
> > +
> > +       if (test_bit(channel, priv->dimm_mask))
> > +               return 0444;
> > +
> > +       return 0;
> > +}
> > +
> > +static const struct hwmon_ops peci_dimmtemp_ops = {
> > +       .is_visible = dimmtemp_is_visible,
> > +       .read_string = dimmtemp_read_string,
> > +       .read = dimmtemp_read,
> > +};
> > +
> > +static int check_populated_dimms(struct peci_dimmtemp *priv)
> > +{
> > +       int chan_rank_max = priv->gen_info->chan_rank_max;
> > +       int dimm_idx_max = priv->gen_info->dimm_idx_max;
> > +       int chan_rank, dimm_idx, ret;
> > +       u64 dimm_mask = 0;
> > +       u32 pcs;
> > +
> > +       for (chan_rank = 0; chan_rank < chan_rank_max; chan_rank++) {
> > +               ret = peci_pcs_read(priv->peci_dev, PECI_PCS_DDR_DIMM_TEMP,
> > chan_rank, &pcs);
> > +               if (ret) {
> > +                       /*
> > +                        * Overall, we expect either success or -EINVAL in
> > +                        * order to determine whether DIMM is populated or
> > not.
> > +                        * For anything else - we fall back to defering the
> > +                        * detection to be performed at a later point in
> > time.
> > +                        */
> > +                       if (ret == -EINVAL)
> > +                               continue;
> > +                       else
> > +                               return -EAGAIN;
> > +               }
> > +
> > +               for (dimm_idx = 0; dimm_idx < dimm_idx_max; dimm_idx++)
> > +                       if (__dimm_temp(pcs, dimm_idx))
> > +                               dimm_mask |= BIT(chan_rank * dimm_idx_max +
> > dimm_idx);
> > +       }
> > +       /*
> > +        * It's possible that memory training is not done yet. In this case
> > we
> > +        * defer the detection to be performed at a later point in time.
> > +        */
> > +       if (!dimm_mask)
> > +               return -EAGAIN;
> > +
> > +       dev_dbg(priv->dev, "Scanned populated DIMMs: %#llx\n", dimm_mask);
> 
> Hmm, though aside from this one debug print it seems like this function
> could just as easily operate directly on priv->dimm_mask if we wanted to
> make it safe for >64 dimms (I have no particular objection to keeping it
> as-is for now though).

I'll leave it as is for now.

Thanks
-Iwona


More information about the openbmc mailing list