[PATCH v3] powerpc/papr_scm: Implement initial support for injecting smart errors
Aneesh Kumar K.V
aneesh.kumar at linux.ibm.com
Mon Jan 17 19:30:42 AEDT 2022
Vaibhav Jain <vaibhav at linux.ibm.com> writes:
> Presently PAPR doesn't support injecting smart errors on an
> NVDIMM. This makes testing the NVDIMM health reporting functionality
> difficult as simulating NVDIMM health related events need a hacked up
> qemu version.
>
> To solve this problem this patch proposes simulating certain set of
> NVDIMM health related events in papr_scm. Specifically 'fatal' health
> state and 'dirty' shutdown state. These error can be injected via the
> user-space 'ndctl-inject-smart(1)' command. With the proposed patch and
> corresponding ndctl patches following command flow is expected:
>
> $ sudo ndctl list -DH -d nmem0
> ...
> "health_state":"ok",
> "shutdown_state":"clean",
> ...
> # inject unsafe shutdown and fatal health error
> $ sudo ndctl inject-smart nmem0 -Uf
> ...
> "health_state":"fatal",
> "shutdown_state":"dirty",
> ...
> # uninject all errors
> $ sudo ndctl inject-smart nmem0 -N
> ...
> "health_state":"ok",
> "shutdown_state":"clean",
> ...
>
> The patch adds two members 'health_bitmap_mask' and
> 'health_bitmap_override' inside struct papr_scm_priv which are then
> bit blt'ed to the health bitmaps fetched from the hypervisor. In case
> we are not able to fetch health information from the hypervisor we
> service the health bitmap from these two members. These members are
> accessible from sysfs at nmemX/papr/health_bitmap_override
>
> A new PDSM named 'SMART_INJECT' is proposed that accepts newly
> introduced 'struct nd_papr_pdsm_smart_inject' as payload thats
> exchanged between libndctl and papr_scm to indicate the requested
> smart-error states.
>
> When the processing the PDSM 'SMART_INJECT', papr_pdsm_smart_inject()
> constructs a pair or 'mask' and 'override' bitmaps from the payload
> and bit-blt it to the 'health_bitmap_{mask, override}' members. This
> ensures the after being fetched from the hypervisor, the health_bitmap
> reflects requested smart-error states.
>
> Signed-off-by: Vaibhav Jain <vaibhav at linux.ibm.com>
> Signed-off-by: Shivaprasad G Bhat <sbhat at linux.ibm.com>
> ---
> Changelog:
>
> Since v2:
> * Rebased the patch to ppc-next
> * Added documentation for newly introduced sysfs attribute 'health_bitmap_override'
>
> Since v1:
> * Updated the patch description.
> * Removed dependency of a header movement patch.
> * Removed '__packed' attribute for 'struct nd_papr_pdsm_smart_inject' [Aneesh]
> ---
> Documentation/ABI/testing/sysfs-bus-papr-pmem | 13 +++
> arch/powerpc/include/uapi/asm/papr_pdsm.h | 18 ++++
> arch/powerpc/platforms/pseries/papr_scm.c | 94 ++++++++++++++++++-
> 3 files changed, 122 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem b/Documentation/ABI/testing/sysfs-bus-papr-pmem
> index 95254cec92bf..8a0b2a7f7671 100644
> --- a/Documentation/ABI/testing/sysfs-bus-papr-pmem
> +++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem
> @@ -61,3 +61,16 @@ Description:
> * "CchRHCnt" : Cache Read Hit Count
> * "CchWHCnt" : Cache Write Hit Count
> * "FastWCnt" : Fast Write Count
> +
> +What: /sys/bus/nd/devices/nmemX/papr/health_bitmap_override
> +Date: Jan, 2022
> +KernelVersion: v5.17
> +Contact: linuxppc-dev <linuxppc-dev at lists.ozlabs.org>, nvdimm at lists.linux.dev,
> +Description:
> + (RO) Reports the health bitmap override/mask bitmaps that are
> + applied to top bitmap received from PowerVM via the H_SCM_HEALTH
> + Hcall. Together these can be used to forcibly set/reset specific
> + bits returned from Hcall. These bitmaps are presently used to
> + simulate various health or shutdown states for an nvdimm and are
> + set by user-space tools like ndctl by issuing a PAPR DSM.
> +
> diff --git a/arch/powerpc/include/uapi/asm/papr_pdsm.h b/arch/powerpc/include/uapi/asm/papr_pdsm.h
> index 82488b1e7276..17439925045c 100644
> --- a/arch/powerpc/include/uapi/asm/papr_pdsm.h
> +++ b/arch/powerpc/include/uapi/asm/papr_pdsm.h
> @@ -116,6 +116,22 @@ struct nd_papr_pdsm_health {
> };
> };
>
> +/* Flags for injecting specific smart errors */
> +#define PDSM_SMART_INJECT_HEALTH_FATAL (1 << 0)
> +#define PDSM_SMART_INJECT_BAD_SHUTDOWN (1 << 1)
> +
> +struct nd_papr_pdsm_smart_inject {
> + union {
> + struct {
> + /* One or more of PDSM_SMART_INJECT_ */
> + __u32 flags;
> + __u8 fatal_enable;
> + __u8 unsafe_shutdown_enable;
> + };
> + __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
> + };
> +};
> +
> /*
> * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel
> * via 'nd_cmd_pkg.nd_command' member of the ioctl struct
> @@ -123,12 +139,14 @@ struct nd_papr_pdsm_health {
> enum papr_pdsm {
> PAPR_PDSM_MIN = 0x0,
> PAPR_PDSM_HEALTH,
> + PAPR_PDSM_SMART_INJECT,
> PAPR_PDSM_MAX,
> };
>
> /* Maximal union that can hold all possible payload types */
> union nd_pdsm_payload {
> struct nd_papr_pdsm_health health;
> + struct nd_papr_pdsm_smart_inject smart_inject;
> __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
> } __packed;
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
> index f48e87ac89c9..de4cf329cfb3 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -68,6 +68,10 @@
> #define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
> #define PAPR_SCM_PERF_STATS_VERSION 0x1
>
> +/* Use bitblt method to override specific bits in the '_bitmap_' */
> +#define BITBLT_BITMAP(_bitmap_, _mask_, _override_) \
> + (((_bitmap_) & ~(_mask_)) | ((_mask_) & (_override_)))
> +
> /* Struct holding a single performance metric */
> struct papr_scm_perf_stat {
> u8 stat_id[8];
> @@ -120,6 +124,12 @@ struct papr_scm_priv {
>
> /* length of the stat buffer as expected by phyp */
> size_t stat_buffer_len;
> +
> + /* The bits which needs to be overridden */
> + u64 health_bitmap_mask;
> +
> + /* The overridden values for the bits having the masks set */
> + u64 health_bitmap_override;
> };
>
> static int papr_scm_pmem_flush(struct nd_region *nd_region,
> @@ -347,19 +357,28 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p,
> static int __drc_pmem_query_health(struct papr_scm_priv *p)
> {
> unsigned long ret[PLPAR_HCALL_BUFSIZE];
> + u64 bitmap = 0;
> long rc;
>
> /* issue the hcall */
> rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index);
> - if (rc != H_SUCCESS) {
> + if (rc == H_SUCCESS)
> + bitmap = ret[0] & ret[1];
> + else if (rc == H_FUNCTION)
> + dev_info_once(&p->pdev->dev,
> + "Hcall H_SCM_HEALTH not implemented, assuming empty health bitmap");
> + else {
> +
> dev_err(&p->pdev->dev,
> "Failed to query health information, Err:%ld\n", rc);
> return -ENXIO;
> }
>
> p->lasthealth_jiffies = jiffies;
> - p->health_bitmap = ret[0] & ret[1];
> -
> + /* Allow overriding specific health bits via bit blt. */
> + bitmap = BITBLT_BITMAP(bitmap, p->health_bitmap_mask,
> + p->health_bitmap_override);
> + WRITE_ONCE(p->health_bitmap, bitmap);
Why WRITE_ONCE ?
> dev_dbg(&p->pdev->dev,
> "Queried dimm health info. Bitmap:0x%016lx Mask:0x%016lx\n",
> ret[0], ret[1]);
> @@ -669,6 +688,54 @@ static int papr_pdsm_health(struct papr_scm_priv *p,
> return rc;
> }
>
> +/* Inject a smart error Add the dirty-shutdown-counter value to the pdsm */
> +static int papr_pdsm_smart_inject(struct papr_scm_priv *p,
> + union nd_pdsm_payload *payload)
> +{
> + int rc;
> + u32 supported_flags = 0;
> + u64 mask = 0, override = 0;
> +
> + /* Check for individual smart error flags and update mask and override */
> + if (payload->smart_inject.flags & PDSM_SMART_INJECT_HEALTH_FATAL) {
> + supported_flags |= PDSM_SMART_INJECT_HEALTH_FATAL;
> + mask |= PAPR_PMEM_HEALTH_FATAL;
> + override |= payload->smart_inject.fatal_enable ?
> + PAPR_PMEM_HEALTH_FATAL : 0;
> + }
> +
> + if (payload->smart_inject.flags & PDSM_SMART_INJECT_BAD_SHUTDOWN) {
> + supported_flags |= PDSM_SMART_INJECT_BAD_SHUTDOWN;
> + mask |= PAPR_PMEM_SHUTDOWN_DIRTY;
> + override |= payload->smart_inject.unsafe_shutdown_enable ?
> + PAPR_PMEM_SHUTDOWN_DIRTY : 0;
> + }
> +
> + dev_dbg(&p->pdev->dev, "[Smart-inject] Mask=%#llx override=%#llx\n",
> + mask, override);
> +
> + /* Prevent concurrent access to dimm health bitmap related members */
> + rc = mutex_lock_interruptible(&p->health_mutex);
> + if (rc)
> + return rc;
> +
> + /* Bitblt mask/override to corrosponding health_bitmap couterparts */
> + p->health_bitmap_mask = BITBLT_BITMAP(p->health_bitmap_mask,
> + mask, override);
is that correct? Should we do that mask & override ? Shouldn't this be
p->health_bitmap_mask = BITBLT_BITMAP(p->health_bitmap_mask,
mask, ~0x0UL);
> + p->health_bitmap_override = BITBLT_BITMAP(p->health_bitmap_override,
> + mask, override);
> +
> + /* Invalidate cached health bitmap */
> + p->lasthealth_jiffies = 0;
> +
> + mutex_unlock(&p->health_mutex);
> +
> + /* Return the supported flags back to userspace */
> + payload->smart_inject.flags = supported_flags;
> +
> + return sizeof(struct nd_papr_pdsm_health);
> +}
> +
> /*
> * 'struct pdsm_cmd_desc'
> * Identifies supported PDSMs' expected length of in/out payloads
> @@ -702,6 +769,12 @@ static const struct pdsm_cmd_desc __pdsm_cmd_descriptors[] = {
> .size_out = sizeof(struct nd_papr_pdsm_health),
> .service = papr_pdsm_health,
> },
> +
> + [PAPR_PDSM_SMART_INJECT] = {
> + .size_in = sizeof(struct nd_papr_pdsm_smart_inject),
> + .size_out = sizeof(struct nd_papr_pdsm_smart_inject),
> + .service = papr_pdsm_smart_inject,
> + },
> /* Empty */
> [PAPR_PDSM_MAX] = {
> .size_in = 0,
> @@ -838,6 +911,20 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
> return 0;
> }
>
> +static ssize_t health_bitmap_override_show(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct nvdimm *dimm = to_nvdimm(dev);
> + struct papr_scm_priv *p = nvdimm_provider_data(dimm);
> +
> + return sprintf(buf, "mask=%#llx override=%#llx\n",
> + READ_ONCE(p->health_bitmap_mask),
> + READ_ONCE(p->health_bitmap_override));
> +}
> +
> +static DEVICE_ATTR_ADMIN_RO(health_bitmap_override);
> +
> static ssize_t perf_stats_show(struct device *dev,
> struct device_attribute *attr, char *buf)
> {
> @@ -952,6 +1039,7 @@ static struct attribute *papr_nd_attributes[] = {
> &dev_attr_flags.attr,
> &dev_attr_perf_stats.attr,
> &dev_attr_dirty_shutdown.attr,
> + &dev_attr_health_bitmap_override.attr,
> NULL,
> };
>
> --
> 2.34.1
More information about the Linuxppc-dev
mailing list