[Skiboot] [PATCH 2/2] npu2.c: Add PE error detection
Alistair Popple
alistair at popple.id.au
Thu Jan 11 15:28:51 AEDT 2018
Invalid accesses from the GPU can cause a specific PE to be frozen by the
NPU. Add an interrupt handler which reports the frozen PE to the operating
system as an EEH event.
Signed-off-by: Alistair Popple <alistair at popple.id.au>
---
hw/npu2.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++--
include/npu2-regs.h | 17 +----------------
2 files changed, 54 insertions(+), 18 deletions(-)
diff --git a/hw/npu2.c b/hw/npu2.c
index 6658ab50..c88394b6 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -1213,6 +1213,35 @@ static int64_t npu2_freeze_status(struct phb *phb __unused,
return OPAL_SUCCESS;
}
+static int64_t npu2_eeh_next_error(struct phb *phb,
+ uint64_t *first_frozen_pe,
+ uint16_t *pci_error_type,
+ uint16_t *severity)
+{
+ struct npu2 *p = phb_to_npu2(phb);
+ int i;
+ uint64_t result = 0;
+
+ if (!first_frozen_pe || !pci_error_type || !severity)
+ return OPAL_PARAMETER;
+
+ *first_frozen_pe = -1;
+ *pci_error_type = OPAL_EEH_NO_ERROR;
+ *severity = OPAL_EEH_SEV_NO_ERROR;
+
+ for (i = 0; i < NPU2_MAX_PE_NUM; i++) {
+ result = npu2_read(p, NPU2_MISC_PESTB(i));
+ if (result > 0) {
+ *first_frozen_pe = i;
+ *pci_error_type = OPAL_EEH_PE_ERROR;
+ *severity = OPAL_EEH_SEV_PE_ER;
+ break;
+ }
+ }
+
+ return OPAL_SUCCESS;
+}
+
static int64_t npu2_tce_kill(struct phb *phb, uint32_t kill_type,
uint64_t pe_number, uint32_t tce_size,
uint64_t dma_addr, uint32_t npages)
@@ -1281,7 +1310,7 @@ static const struct phb_ops npu_ops = {
.eeh_freeze_status = npu2_freeze_status,
.eeh_freeze_clear = NULL,
.eeh_freeze_set = NULL,
- .next_error = NULL,
+ .next_error = npu2_eeh_next_error,
.err_inject = NULL,
.get_diag_data = NULL,
.get_diag_data2 = NULL,
@@ -1814,7 +1843,14 @@ static void npu2_add_phb_properties(struct npu2 *p)
static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn __unused)
{
- return IRQ_ATTR_TARGET_LINUX;
+ struct npu2 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if (idx == 18)
+ /* TCE Interrupt - used to detect a frozen PE */
+ return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE;
+ else
+ return IRQ_ATTR_TARGET_LINUX;
}
static char *npu2_ipi_name(struct irq_source *is, uint32_t isn)
@@ -1852,7 +1888,22 @@ static char *npu2_ipi_name(struct irq_source *is, uint32_t isn)
return strdup(name);
}
+static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
+{
+ struct npu2 *p = is->data;
+ uint32_t idx = isn - p->base_lsi;
+
+ if (idx != 18) {
+ prerror("OPAL received unknown NPU2 interrupt %d\n", idx);
+ return;
+ }
+
+ opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+ OPAL_EVENT_PCI_ERROR);
+}
+
static const struct irq_source_ops npu2_ipi_ops = {
+ .interrupt = npu2_err_interrupt,
.attributes = npu2_ipi_attributes,
.name = npu2_ipi_name,
};
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index fdaad192..e739ac50 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -342,22 +342,7 @@ void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask);
#define NPU2_MISC_BRICK_BDF2PE_MAP_ENABLE PPC_BIT(0)
#define NPU2_MISC_BRICK_BDF2PE_MAP_PE PPC_BITMASK(4,7)
#define NPU2_MISC_BRICK_BDF2PE_MAP_BDF PPC_BITMASK(8,23)
-#define NPU2_MISC_PESTB00 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x200)
-#define NPU2_MISC_PESTB01 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x208)
-#define NPU2_MISC_PESTB02 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x210)
-#define NPU2_MISC_PESTB03 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x218)
-#define NPU2_MISC_PESTB04 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x220)
-#define NPU2_MISC_PESTB05 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x228)
-#define NPU2_MISC_PESTB06 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x230)
-#define NPU2_MISC_PESTB07 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x238)
-#define NPU2_MISC_PESTB08 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x240)
-#define NPU2_MISC_PESTB09 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x248)
-#define NPU2_MISC_PESTB10 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x250)
-#define NPU2_MISC_PESTB11 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x258)
-#define NPU2_MISC_PESTB12 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x260)
-#define NPU2_MISC_PESTB13 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x268)
-#define NPU2_MISC_PESTB14 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x270)
-#define NPU2_MISC_PESTB15 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x278)
+#define NPU2_MISC_PESTB(pe) NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x200 + (pe)*8)
#define NPU2_MISC_IRQ_LOG0 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x300)
#define NPU2_MISC_IRQ_LOG01 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x308)
#define NPU2_MISC_IRQ_LOG02 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x310)
--
2.11.0
More information about the Skiboot
mailing list