[Skiboot] [PATCH V2 3/4] nvlink: Add freeze and fence error injection
Alistair Popple
alistair at popple.id.au
Wed Jan 20 15:39:44 AEDT 2016
This patch, along with the others, seems to work well:
[ 159.531734] EEH: Frozen PE#1 on PHB#8 detected
[ 159.531947] EEH: PE location: N/A, PHB location: N/A
[ 159.532142] nvidia-nvlink: IBMNPU: ibmnpu_pci_error_detected device 0008:00:00.0
[ 159.532299] nvidia-nvlink: IBMNPU: ibmnpu_pci_error_detected device 0008:00:00.1
[ 166.340741] nvidia-nvlink: Failed to register NPU device : -7
Now we just need to teach the nvlink driver how to handle these.
Acked-by: Alistair Popple <alistair at popple.id.au>
On Mon, 18 Jan 2016 16:59:41 Russell Currey wrote:
> Enable NPU freeze and fence injection through debugfs.
>
> For example, if an NPU is PCI bus 8, a freeze on PE 1 can be injected with:
>
> echo 1:0:0:0:0 >> /sys/kernel/debug/powerpc/PCI0008/err_injct
>
> or a fence on PE 2 on PCI bus 9 with:
>
> echo 2:1:0:0:0 >> /sys/kernel/debug/powerpc/PCI0009/err_injct
>
> These will cause the appropriate EEH event to occur upon a DMA to the
> NVLink.
>
> PE number was added to the npu_dev struct to enable this.
>
> Signed-off-by: Russell Currey <ruscur at russell.cc>
> ---
> V2: Return OPAL_PARAMETER on failures and find devices based on pe_num
> ---
> hw/npu.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
> include/npu.h | 2 ++
> 2 files changed, 45 insertions(+), 1 deletion(-)
>
> diff --git a/hw/npu.c b/hw/npu.c
> index 3c2c0b8..23facaf 100644
> --- a/hw/npu.c
> +++ b/hw/npu.c
> @@ -954,6 +954,7 @@ static int64_t npu_set_pe(struct phb *phb,
> return OPAL_PARAMETER;
>
> link_idx = dev->index;
> + dev->pe_num = pe_num;
>
> /* Separate links will be mapped to different PEs */
> if (bcompare != OpalPciBusAll ||
> @@ -1020,6 +1021,47 @@ static int64_t npu_freeze_status(struct phb *phb,
> return OPAL_SUCCESS;
> }
>
> +/* Sets the NPU to trigger an error when a DMA occurs */
> +static int64_t npu_err_inject(struct phb *phb, uint32_t pe_num,
> + uint32_t type, uint32_t func __unused,
> + uint64_t addr __unused, uint64_t mask __unused)
> +{
> + struct npu *p = phb_to_npu(phb);
> + struct npu_dev *dev = NULL;
> + int i;
> +
> + if (pe_num > NPU_NUM_OF_PES) {
> + prlog(PR_ERR, "NPU: error injection failed, bad PE given\n");
> + return OPAL_PARAMETER;
> + }
> +
> + for (i = 0; i < p->total_devices; i++) {
> + if (p->devices[i].pe_num == pe_num) {
> + dev = &p->devices[i];
> + break;
> + }
> + }
> +
> + if (!dev) {
> + prlog(PR_ERR, "NPU: couldn't find device with PE %x\n", pe_num);
> + return OPAL_PARAMETER;
> + }
> +
> + /* TODO: extend this to conform to OPAL injection standards */
> + if (type > 1) {
> + prlog(PR_ERR, "NPU: invalid error injection type\n");
> + return OPAL_PARAMETER;
> + } else if (type == 1) {
> + /* Emulate fence mode. */
> + p->fenced = true;
> + } else {
> + /* Cause a freeze with an invalid MMIO read. */
> + in_be64((void *)dev->bar.base);
> + }
> +
> + return OPAL_SUCCESS;
> +}
> +
> static const struct phb_ops npu_ops = {
> .lock = npu_lock,
> .unlock = npu_unlock,
> @@ -1059,7 +1101,7 @@ static const struct phb_ops npu_ops = {
> .eeh_freeze_clear = NULL,
> .eeh_freeze_set = NULL,
> .next_error = NULL,
> - .err_inject = NULL,
> + .err_inject = npu_err_inject,
> .get_diag_data = NULL,
> .get_diag_data2 = NULL,
> .set_capi_mode = NULL,
> diff --git a/include/npu.h b/include/npu.h
> index 5d5135b..389b732 100644
> --- a/include/npu.h
> +++ b/include/npu.h
> @@ -144,6 +144,8 @@ struct npu_dev {
> unsigned long procedure_tb;
>
> uint32_t procedure_status;
> +
> + uint8_t pe_num;
> };
>
> /* NPU PHB descriptor */
>
More information about the Skiboot
mailing list