[Skiboot] [PATCH V2 7/7] hmi: Add handling for NPU checkstops
Alistair Popple
alistair at popple.id.au
Thu Mar 24 14:07:58 AEDT 2016
On Mon, 21 Mar 2016 12:00:06 Russell Currey wrote:
> If the NPU detects an unrecoverable error, it will send an HMI. This is
> problematic since unhandled HMIs will checkstop the entire system, which
> is not the intended behaviour of an NPU failure. Instead, the NPU
> emulated PCI devices should be fenced as part of EEH.
>
> Add support for handling NPU HMIs. This works by finding the NPU
> responsible for the HMI, checking its error registers, and sending a
> recoverable HMI event. The NPU itself cannot actually recover, but the
> system should not be brought down. Fence mode is set on the NPU, such
> that any further operations on the NPU will trigger EEH, and it will be
> subsequently fenced from the system.
>
> Signed-off-by: Russell Currey <ruscur at russell.cc>
Reviewed-by: Alistair Popple <alistair at popple.id.au>
> ---
> V2:
> - use for_each_phb macro, dealing with non-continuous PHB IDs
> - add a comment mentioning a future >1 NPU per chip case
> - move the P8NVL check into the function, looks cleaner
> ---
> core/hmi.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> include/opal-api.h | 1 +
> 2 files changed, 72 insertions(+), 1 deletion(-)
>
> diff --git a/core/hmi.c b/core/hmi.c
> index a4c7869..5d8020a 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -24,6 +24,8 @@
> #include <pci.h>
> #include <cpu.h>
> #include <chip.h>
> +#include <npu-regs.h>
> +#include <npu.h>
>
> /*
> * HMER register layout:
> @@ -439,6 +441,74 @@ static void find_nx_checkstop_reason(int flat_chip_id,
> *event_generated = 1;
> }
>
> +static void find_npu_checkstop_reason(int flat_chip_id,
> + struct OpalHMIEvent *hmi_evt,
> + int *event_generated)
> +{
> + struct phb *phb;
> + struct npu *p = NULL;
> +
> + uint64_t npu_fir;
> + uint64_t npu_fir_mask;
> + uint64_t npu_fir_action0;
> + uint64_t npu_fir_action1;
> + uint64_t fatal_errors;
> +
> + /* Only check for NPU errors if the chip has an NPU */
> + if (PVR_TYPE(mfspr(SPR_PVR)) != PVR_TYPE_P8NVL)
> + return;
> +
> + /* Find the NPU on the chip associated with the HMI. */
> + for_each_phb(phb) {
> + /* NOTE: if a chip ever has >1 NPU this will need adjusting */
> + if (dt_node_is_compatible(phb->dt_node, "ibm,power8-npu-pciex") &&
> + (dt_get_chip_id(phb->dt_node) == flat_chip_id)) {
> + p = phb_to_npu(phb);
> + break;
> + }
> + }
> +
> + /* If we didn't find an NPU on the chip, it's not our checkstop. */
> + if (p == NULL)
> + return;
> +
> + /* Read all the registers necessary to find a checkstop condition. */
> + if (xscom_read(flat_chip_id,
> + p->at_xscom + NX_FIR, &npu_fir) ||
> + xscom_read(flat_chip_id,
> + p->at_xscom + NX_FIR_MASK, &npu_fir_mask) ||
> + xscom_read(flat_chip_id,
> + p->at_xscom + NX_FIR_ACTION0, &npu_fir_action0) ||
> + xscom_read(flat_chip_id,
> + p->at_xscom + NX_FIR_ACTION1, &npu_fir_action1)) {
> + prerror("HMI: Couldn't read NPU registers with XSCOM\n");
> + return;
> + }
> +
> + fatal_errors = npu_fir & ~npu_fir_mask & npu_fir_action0 & npu_fir_action1;
> +
> + /* If there are no errors, we don't need to do anything. */
> + if (!fatal_errors)
> + return;
> +
> + prlog(PR_DEBUG,
> + "NPU: FIR %llx FIR mask %llx FIR ACTION0 %llx FIR ACTION1 %llx\n",
> + npu_fir, npu_fir_mask, npu_fir_action0, npu_fir_action1);
> +
> + /* Set the NPU to fenced since it can't recover. */
> + p->fenced = true;
> +
> + /* Set up the HMI event */
> + hmi_evt->severity = OpalHMI_SEV_WARNING;
> + hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
> + hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
> + hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
> +
> + /* The HMI is "recoverable" because it shouldn't crash the system */
> + queue_hmi_event(hmi_evt, 1);
> + *event_generated = 1;
> +}
> +
> static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
> {
> int i;
> @@ -456,8 +526,8 @@ static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
> queue_hmi_event(hmi_evt, recover);
> event_generated = 1;
> }
> -
> find_nx_checkstop_reason(i, hmi_evt, &event_generated);
> + find_npu_checkstop_reason(i, hmi_evt, &event_generated);
> }
>
> if (recover != -1) {
> diff --git a/include/opal-api.h b/include/opal-api.h
> index 369aa93..0b7b0bb 100644
> --- a/include/opal-api.h
> +++ b/include/opal-api.h
> @@ -577,6 +577,7 @@ enum OpalHMI_XstopType {
> CHECKSTOP_TYPE_UNKNOWN = 0,
> CHECKSTOP_TYPE_CORE = 1,
> CHECKSTOP_TYPE_NX = 2,
> + CHECKSTOP_TYPE_NPU = 3
> };
>
> enum OpalHMI_CoreXstopReason {
>
More information about the Skiboot
mailing list