[Skiboot] [PATCH] external/pci-scripts: Add PHB error parsing script

Oliver O'Halloran oohall at gmail.com
Fri Jul 17 14:42:43 AEST 2020


A very hacky, but very useful, script that parses the PowerNV EEH register dump
from the kernel log and the verbose EEH dump from the OPAL message log,
and renders them into something mostly readable.

Cc: Mahesh Salgaonkar <mahesh at linux.ibm.com>
Signed-off-by: Oliver O'Halloran <oohall at gmail.com>
---
 external/pci-scripts/phberr.py | 658 +++++++++++++++++++++++++++++++++
 1 file changed, 658 insertions(+)
 create mode 100755 external/pci-scripts/phberr.py

diff --git a/external/pci-scripts/phberr.py b/external/pci-scripts/phberr.py
new file mode 100755
index 000000000000..5f295fdc962b
--- /dev/null
+++ b/external/pci-scripts/phberr.py
@@ -0,0 +1,658 @@
+#!/usr/bin/env python3
+
+import sys
+import ppc
+import re
+
+# Mnemonic PHB_ESR - Address Offset 0x0C80 - phbErrorStatusRegister
+#
+# Each entry is (bit number, description). PHBError.reg_bits maps this table
+# to the "phbErrorStatus" register, and PHBError.show_errs() converts the bit
+# number to a mask with ppc.ppcbit() and prints the description when that bit
+# is set. Bits with no entry here (e.g. 19) are never reported.
+phb_esr_bits = [
+	(0, "ETU/RSB Request Address Error"),
+	(1, "Fundamental A Request Address Error"),
+	(2, "Fundamental A Request Size/Alignment Error"),
+	(3, "Fundamental A PCI CFG Addr/Size Error"),
+	(4, "Fundamental A IODA Table Access Error"),
+	(5, "Fundamental A Internal Registers Parity Error"),
+	(6, "PHB Error Registers Request Address Error"),
+	(7, "PHB Error Registers Request Size/Alignment Error"),
+	(8, "Fundamental B Request Address Error"),
+	(9, "Fundamental B Request Size/Alignment Error"),
+	(10, "Fundamental B Internal Registers Parity Error"),
+	(11, "Internal Bus Logic Bad PCIE Macro Request Address"),
+	(12, "Debug Request Address Error"),
+	(13, "Debug Request Size/Alignment Error"),
+	(14, "Debug Internal Registers Parity Error"),
+	(15, "Internal Bus Logic State Machine One-Hot Error"),
+	(16, "UV Page Request Address Error"),
+	(17, "UV Page Request Size/Alignment Error"),
+	(18, "UV Page Internal Registers Parity Error"),
+	(20, "RXE_ARB OR Error Status"),
+	(21, "RXE_MRG OR Error Status"),
+	(22, "RXE_TCE OR Error Status"),
+	(23, "TXE OR Error Status"),
+	(24, "pcie_etu_regb_err_inf"),
+	(25, "pcie_etu_regb_err_erc"),
+	(26, "pcie_etu_regb_err_fat"),
+	(27, "bus_regs_req_wr_data_p_e"),
+	(28, "SCOM HV Indirect Access Error"),
+	(29, "SCOM UV Indirect Access Error"),
+	(30, "SCOM Internal Registers Parity Error"),
+	(31, "SCOM Satellite Finite State Machine Error"),
+]
+
+# Mnemonic TXE_ESR  - Address Offset 0x0D00 - txeFirstErrorStatus
+#
+# Each entry is (bit number, description). PHBError.reg_bits maps this table
+# to the "phbTxeErrorStatus" register; PHBError.show_errs() prints the
+# description of every listed bit that is set. Unlisted bits are ignored.
+#
+# NOTE(review): bits 28 and 30 carry the same description below - one of them
+# is presumably a copy/paste slip; confirm against the PHB4 specification.
+txe_esr_bits = [
+	(0, "AIB Command Invalid"),
+	(2, "AIB Address Decode Error"),
+	(3, "AIB Size Invalid"),
+	(4, "AIB Cmd Ctrls Parity Error"),
+	(5, "AIB Data Ctrls Parity Error"),
+	(8, "AIB Alignment Error"),
+	(9, "AIB Cmd Bus Parity Error"),
+	(10, "AIB Data Bus UE ECC Error"),
+	(11, "AIB Data Ctrls Sequence Error"),
+	(12, "AIB Data Bus CE ECC Error"),
+	(13, "TCE Rd Response DAT_ERR Indication"),
+	(14, "AIB Command Credits Error"),
+	(15, "AIB Data Credits Error"),
+	(16, "BLIF Controls Parity Error"),
+	(17, "CFG Write Error CA or UR response"),
+	(18, "BLIF Forward Progress Timeout"),
+	(19, "MMIO RD Pending Error"),
+	(20, "MMIO WR Pending Error"),
+	(21, "MMIO CFG Pending Error"),
+	(22, "MMIO Write DAT_ERR Indication"),
+	(23, "CI Store Data Fifo Error"),
+	(24, "CFG Enable Error, RRB"),
+	(25, "CFG Size Error"),
+	(26, "CFG Bus Address Error"),
+	(27, "CFG Link Down Error"),
+	(28, "PAPR TXE Injection Error Triggered"),
+	(29, "CFG Write Request Timeout"),
+	(30, "PAPR TXE Injection Error Triggered"),
+	(36, "CI Trigger Buffer ECC Correctable Error"),
+	(37, "CI Trigger Buffer ECC Uncorrectable Error"),
+	(38, "CI Trigger Buffer Stage Data Parity Error"),
+	(40, "MMIO BAR Table (MBT) Parity Error"),
+	(42, "MMIO Domain Table (MDT) ECC Correctable Error"),
+	(43, "MMIO Domain Table (MDT) ECC Uncorrectable Error"),
+	(44, "MMIO Domain Table (MDT) Stage Parity Error"),
+	(45, "MMIO Domain Table (MDT) Stage Valid Error"),
+	(46, "AIB Data Special Uncorrectable Error (SUE)"),
+	(47, "MMIO Domain Table (MDT)"),
+	(48, "P2P Store Data Fifo Error"),
+	(49, "EPAT Table Parity Error"),
+	(50, "MMIO Cmd Parity Error"),
+	(51, "BLIF1 Reg Parity Error"),
+	(52, "P2P1 Reg Parity Error"),
+	(53, "P2P WR Pending Error"),
+	(54, "CRW Onehot Error"),
+	(55, "CRW Pending Error"),
+	(56, "RRB Parity Error"),
+	(57, "RRB Size/Alignment Error"),
+	(58, "s_bad_addr_e_q"),
+	(59, "s_req_size_align_e_q"),
+]
+
+# Mnemonic RXE_ARB_ESR - Address Offset 0x0D80 - phbRxeArbErrorStatus
+#
+# Each entry is (bit number, description). PHBError.reg_bits maps this table
+# to the "phbRxeArbErrorStatus" register; PHBError.show_errs() prints the
+# description of every listed bit that is set. Unlisted bits are ignored.
+rxe_arb_bits = [
+	(0, "BLIF Inbound CA Completion Error"),
+	(1, "BLIF Inbound UR Completion Error"),
+	(2, "MSI Size Error"),
+	(3, "MSI Address Alignment Error"),
+	(5, "BLIF Inbound Header ECC Correctable (CE)"),
+	(6, "BLIF Inbound Header ECC Uncorrectable (UE)"),
+	(7, "ARB Stage Valid Error"),
+	(8, "TCE Tag Release Unused"),
+	(9, "TCE Tag Used, Not Free"),
+	(10, "ARB MMIO Buffer Overflow"),
+	(11, "ARB MMIO Buffer Underflow"),
+	(12, "ARB MMIO Internal Parity Error"),
+	(13, "ARB DMA Buffer Overflow"),
+	(14, "ARB DMA Buffer Underflow"),
+	(15, "ARB DMA Internal Parity Error"),
+	(16, "BLIF Header Control Bits Parity Error"),
+	(17, "BLIF Data Control Bits Parity Error"),
+	(18, "BLIF Unsupported Request (UR) Error"),
+	(19, "BLIF Completion Timeout Error"),
+	(20, "SEID Table ECC Correctable (CE)"),
+	(21, "SEID Table ECC Uncorrectable (UE)"),
+	(22, "NBW Size Error"),
+	(23, "DEC IODA Table Fatal Error"),
+	(24, "TLP Poisoned Error"),
+	(25, "MIST ECC Correctable Error"),
+	(26, "IODA TVT Entry Invalid"),
+	(27, "MSI PE# Mismatch"),
+	(28, "IODA TVT Address"),
+	(29, "TVT ECC Correctable Error"),
+	(30, "TVT ECC Uncorrectable Error"),
+	(31, "MIST ECC Uncorrectable Error"),
+	(32, "PELT-V BAR Disabled Error"),
+	(33, "IODA Table Parity Error"),
+	(34, "PCT Timeout"),
+	(35, "PCT Unexpected Completion"),
+	(36, "PCT Parity Error"),
+	(37, "DEC Stage Valid Error"),
+	(38, "DEC Stage Parity Error"),
+	(39, "PAPR Inbound Injection Error Triggered"),
+	(40, "DMA/MSI: RTE PE Number"),
+	(41, "RTT BAR Disabled Error"),
+	(42, "RTC Internal Parity Error"),
+	(43, "RTC Queue Overflow"),
+	(44, "RTC Queue Underflow"),
+	(45, "RTC Stage Valid Error"),
+	(46, "RTC RCAM Bad State Error"),
+	(47, "RTC RCAM Multiple Hit Error"),
+	(48, "RRB Parity Error"),
+	(49, "RRB request Size / Alignment Error"),
+	(50, "s_bad_addr_e_q"),
+	(51, "s_req_size_align_e_q"),
+	(54, "Discontiguous DMA Write Fragmentation"),
+	(55, "LIST Table Parity Error"),
+	(56, "LKP PEST Data Queue Error"),
+	(57, "PCIE Fatal Error Message Received"),
+	(58, "PCIE Nonfatal Error Message Received"),
+	(59, "PCIE Correctable Error Message Received"),
+]
+
+# Mnemonic RXE_MRG_ESR - Address Offset 0x0E00 - phbRxeMrgErrorStatus
+#
+# Each entry is (bit number, description). PHBError.reg_bits maps this table
+# to the "phbRxeMrgErrorStatus" register; PHBError.show_errs() prints the
+# description of every listed bit that is set. Unlisted bits are ignored.
+#
+# NOTE(review): bits 17 and 18 carry the same description below, which looks
+# truncated - confirm the full names against the PHB4 specification.
+rxe_mrg_bits = [
+	(8, "MRG TMB Allocation Error"),
+	(9, "MRG TMB Response Invalid"),
+	(10, "MRG TMB Response Ready Error"),
+	(11, "MRG MMIO Queue Overflow Error"),
+	(12, "MRG MMIO Queue Underflow Error"),
+	(13, "MRG MMIO Internal Parity Error"),
+	(14, "MRG DMA Queue Overflow Error"),
+	(15, "MRG DMA Queue Underflow Error"),
+	(16, "MRG DMA Internal Parity Error"),
+	(17, "MRG Migration Register Table"),
+	(18, "MRG Migration Register Table"),
+	(20, "s_bad_addr_e_q"),
+	(21, "s_req_size_align_e_q"),
+	(22, "RRB Parity Error"),
+	(23, "RRB request Size / Alignment Error"),
+	(24, "DSP AIB TX Timeout Error"),
+	(25, "Reserved (vA4.1)"),
+	(26, "DSP AIB TX CMD Credit Parity Error"),
+	(28, "DSP AIB TX DAT Credit Parity Error"),
+	(30, "DSP Command Credit Overflow Error"),
+	(31, "DSP Command Credit Underflow Error"),
+	(32, "DSP Command Credit Parity Error"),
+	(33, "DSP Data Credit Overflow Error"),
+	(34, "DSP Data Credit Underflow Error"),
+	(35, "DSP Data Credit Parity Error"),
+	(36, "DSP Completion State Machine One-Hot Error"),
+	(37, "DSP Write Thread State Machine One-Hot Error"),
+	(38, "DSP DMA Secure Address Error (vA4.2)"),
+	(39, "DSP MSI Interrupt Notification Secure Address"),
+	(40, "DSP TREQ ECC Correctable Error"),
+	(41, "DSP TREQ ECC Uncorrectable Error"),
+	(42, "DSP MMIO Queue Overflow Error"),
+	(43, "DSP MMIO Queue Underflow Error"),
+	(44, "DSP MMIO Internal Parity Error"),
+	(45, "DSP DMA Queue Overflow Error"),
+	(46, "DSP DMA Queue Underflow Error"),
+	(47, "DSP DMA Internal Parity Error"),
+	(48, "DSP Read Thread State Machine One-Hot Error"),
+	(49, "DSP Table State Machine One-Hot Error"),
+	(50, "DSP NBW State Machine One-Hot Error"),
+	(51, "DSP TSM PEST BAR Disabled Error"),
+	(56, "IPD ECC Correctable Error"),
+	(57, "IPD ECC Uncorrectable Error"),
+	(58, "ICPLD ECC Correctable Error"),
+	(59, "ICPLD ECC Uncorrectable Error"),
+	(60, "NBWD ECC Correctable Error"),
+	(61, "NBWD ECC Uncorrectable Error"),
+	(63, "pb_etu_ai_rx_raise_fence"),
+]
+
+
+# Mnemonic RXE_TCE_ESR -  Address Offset 0x0E80 - phbRxeTceErrorStatus
+#
+# Each entry is (bit number, description). PHBError.reg_bits maps this table
+# to the "phbRxeTceErrorStatus" register; PHBError.show_errs() prints the
+# description of every listed bit that is set. Unlisted bits are ignored.
+rxe_tce_bits = [
+	(0, "TCE CMP Internal Parity Error"),
+	(1, "TCE Request Page Access Error"),
+	(2, "TCE Response Page Access Error"),
+	(3, "TCE CMP Queue Overflow"),
+	(4, "TCE CMP Queue Underflow"),
+	(5, "TCE Secure Address Error"),
+	(6, "TCE Cache Bad State Error"),
+	(7, "TCE Cache Multi-Way Hit Error"),
+	(8, "TCE Request Timeout Error"),
+	(9, "TCE TCR ECC Correctable Error"),
+	(10, "TCE TCR ECC Uncorrectable Error"),
+	(11, "TCE TDR ECC Correctable Error"),
+	(12, "TCE TDR ECC Uncorrectable Error"),
+	(13, "TCE Unexpected Response Error"),
+	(14, "RRB Parity Error"),
+	(15, "RRB request Size / Alignment Error"),
+	(16, "TCE RES Internal Parity Error"),
+	(17, "s_bad_addr_e_q"),
+	(18, "s_req_size_align_e_q"),
+	(19, "TCE RES Queue Overflow"),
+	(20, "TCE RES Queue Underflow"),
+	(21, "TCE Response Data Parity Error"),
+	(22, "TCE TCLB CAM Bad State Error"),
+	(23, "TCE TCLB CAM Multi-Hit Error"),
+	(24, "TCE Kill Internal Parity Error"),
+	(25, "TCE THASH Array ECC Correctable Error"),
+	(26, "TCE THASH Array ECC Uncorrectable Error"),
+	(27, "TCE TCLB TDAT ECC Correctable Error"),
+	(28, "TCE TCLB TDAT ECC Uncorrectable Error"),
+	(29, "TCE Kill State Machine One-Hot Error"),
+	(30, "TCE Kill Queue Overflow"),
+	(31, "TCE Kill Queue Underflow"),
+	(32, "TCE Request Secure Address Register"),
+	(33, "TCE Response Secure Address Register"),
+]
+
+
+# Mnemonic PBL_ESR  - Address Offset 0x1900 - phbPblErrorStatus
+#
+# Each entry is (bit number, description). PHBError.reg_bits maps this table
+# to the "phbPblErrorStatus" register; PHBError.show_errs() prints the
+# description of every listed bit that is set. Unlisted bits are ignored.
+# Most descriptions here are raw signal names rather than readable text.
+pbl_esr_bits = [
+	(0, "pb_err_p_fe_tlif_rx_par_e Parity error detected on TLIF Receive interface."),
+	(1, "pb_err_p_fe_tlif_tx_par_e Parity error detected on TLIF Transmit interface."),
+	(2, "pb_err_p_fe_blif_out_par_e"),
+	(3, "pb_err_p_fe_blif_in_par_e"),
+	(4, "pb_err_p_fe_int_par_e"),
+	(5, "pb_err_p_fe_toc_cred_e"),
+	(6, "pb_err_p_fe_ocf_par_e"),
+	(7, "pb_err_p_fe_ocf_prot_e"),
+	(12, "pb_err_p_fe_pct_erq_overflow_e"),
+	(13, "pb_err_p_fe_pct_erq_underflow_e"),
+	(14, "pb_err_p_fe_pct_onp_tags_rls_unused_e"),
+	(15, "pb_err_p_fe_pct_onp_tags_used_notfree_e"),
+	(16, "pb_err_p_fe_pct_onp_tags_used_unexp_e"),
+	(17, "pb_err_p_fe_bct_onp_tags_rls_unused_e"),
+	(18, "pb_err_p_fe_bct_onp_tags_used_notfree_e"),
+	(19, "pb_err_p_fe_ib_bct_rd_inv"),
+	(20, "pb_err_p_fe_ob_buffer_overflow_e"),
+	(21, "pb_err_p_fe_ob_buffer_underflow_e"),
+	(22, "pb_err_p_fe_ib_buffer_overflow_e"),
+	(23, "pb_err_p_fe_ib_buffer_underflow_e"),
+	(24, "pb_err_p_fe_ib_d_ecc_ue"),
+	(25, "pb_err_p_fe_ib_h_ecc_ue"),
+	(26, "pb_err_p_fe_ob_d_ecc_ue"),
+	(27, "pb_err_p_fe_ob_h_ecc_ue"),
+	(28, "pb_err_p_fe_ocf_ecc_ue"),
+	(32, "pb_err_p_fe_tx_pst_discard_e"),
+	(33, "pb_err_p_inf_tx_npst_discard_e"),
+	(34, "pb_err_p_fe_nbw_tlp_e"),
+	(36, "pb_err_p_fe_pci_rcv_cpl_ca_e"),
+	(37, "pb_err_p_fe_pci_rcv_cpl_crs_e"),
+	(38, "pb_err_p_fe_pci_rcv_cpl_rsvd_e"),
+	(39, "pb_err_p_fe_pci_rcv_cpl_ur_e"),
+	(40, "pb_err_p_fe_pci_rcv_ecrc_e"),
+	(41, "pb_err_p_fe_pci_rcv_malf_tlp_e"),
+	(42, "pb_err_p_fe_pci_rcv_overflow_e"),
+	(43, "pb_err_p_fe_pci_rcv_poisoned_tlp_e"),
+	(44, "pb_err_p_fe_pci_rcv_unexp_cpl_e"),
+	(45, "pb_err_p_fe_pci_rcv_unsup_req_e"),
+	(46, "pb_err_p_fe_pci_sig_cpl_abort_e"),
+	(47, "pb_err_p_fe_pci_sig_cpl_timeout_e"),
+	(48, "pb_err_p_fe_pci_sig_poisoned_tlp_e"),
+	(52, "pb_err_p_inf_out_trans_to_pst_e"),
+	(53, "pb_err_p_inf_out_trans_to_npst_e"),
+	(54, "pb_err_p_inf_out_trans_to_cpl_e"),
+	(56, "pb_err_p_inf_ib_d_ecc_ce"),
+	(57, "pb_err_p_inf_ib_h_ecc_ce"),
+	(58, "pb_err_p_inf_ob_d_ecc_ce"),
+	(59, "pb_err_p_inf_ob_h_ecc_ce"),
+	(60, "pb_err_p_inf_ocf_ecc_ce"),
+	(62, "PBL Bad Register Address Error"),
+	(63, "PBL Register Parity Error"),
+]
+
+# Mnemonic REGB_ESR - Address Offset 0x1C00 - phbRegbErrorStatus
+#
+# Each entry is (bit number, description). PHBError.reg_bits maps this table
+# to the "phbRegbErrorStatus" register; PHBError.show_errs() prints the
+# description of every listed bit that is set. Unlisted bits are ignored.
+#
+# NOTE(review): the description for bit 36 contains a space after "EC08_",
+# unlike its neighbours - presumably a transcription artifact; confirm before
+# changing, since the string is printed verbatim.
+regb_esr_bits = [
+	(0, "REGB Internal Register Parity Error"),
+	(1, "PBL Internal Register Parity Error"),
+	(2, "Invalid Address Decode Error"),
+	(3, "Register Access Invalid Address+Size Error"),
+	(5, "Register State Machine or Other Internal Error"),
+	(6, "PCI CFG Core Registers Parity Error"),
+	(7, "Register access to CFG core while in reset error."),
+	(8, "PCIE Link Down"),
+	(9, "PCIE Link Up"),
+	(10, "PCIE Link Auto Bandwidth Event Status"),
+	(11, "PCIE Link BW Management Event Status"),
+	(25, "PBL Error Trap: INF Error"),
+	(26, "PBL Error Trap: ERC Error"),
+	(27, "PBL Error Trap: FAT Error"),
+	(28, "tldlpo_dl_mon_rxreceivererror(0)"),
+	(29, "tldlpo_dl_mon_rxreceivererror(1)"),
+	(30, "tldlpo_dl_mon_rxreceivererror(2)"),
+	(32, "DL_EC08_BADDLLP"),
+	(33, "DL_EC08_BADTLP"),
+	(34, "DL_EC08_DLLPE"),
+	(35, "DL_EC08_RECEIVERERROR"),
+	(36, "DL_EC08_ REPLAYROLLOVER"),
+	(37, "DL_EC08_REPLAYTIMEOUT"),
+	(39, "DL_INTERNALERROR"),
+	(40, "DL_LB_ERROR"),
+	(41, "DL_RX_MALFORMED"),
+	(42, "DL_RX_NULLIFY"),
+	(43, "DL_RX_OVERFLOW"),
+	(44, "DL_TX_CORRERROR"),
+	(45, "DL_TX_UNCORRERROR"),
+	(46, "TL_EC08_FCPE"),
+	(48, "Replay ECC Correctable Error (CE)"),
+	(49, "Replay ECC UnCorrectable Error (UE)"),
+	(50, "Bad DLLP Error Count Saturated"),
+	(51, "Bad TLP Error Count Saturated"),
+	(52, "Receiver Error Count Saturated"),
+	(53, "DLLPE Error Count Saturated"),
+	(58, "pbl_ptl_dl_al_rx_initcredit_p_e"),
+	(59, "pbl_ptl_dl_al_rx_updatecredit_p_e"),
+	(60, "PTL Core DLIF Protocol Error"),
+	(61, "PTL Core TLIF Protocol Error"),
+	(62, "PTL Core Internal Parity Error"),
+]
+
+# Bits of the register that PHBError.reg_bits calls "NEST FIR".
+#
+# Each entry is (bit number, short name); the trailing comment on each line is
+# the longer description, which is not part of the data and so is never
+# printed by PHBError.show_errs().
+#
+# FIXME: use the long desc
+nfir_bits = [
+	(0, "bar_pe"), # One of the BARs or BAR Mask Register parity error.
+	(1, "nonbar_pe"), # Any non-BAR parity error.
+	(2, "PB_to_PEC_ce"), # ECC correctable error off of outbound SMP interconnect.
+	(3, "PB_to_PEC_ue"), # ECC uncorrectable error off of outbound SMP interconnect.
+	(4, "PB_to_PEC_sue"), # ECC special uncorrectable error off of outbound SMP interconnect
+	(5, "ary_ecc_ce"), # ECC correctable error on an internal array.
+	(6, "ary_ecc_ue"), # ECC uncorrectable error on an internal array.
+	(7, "ary_ecc_sue"), # ECC special uncorrectable error on an internal array.
+	(8, "register_array_pe"), # Parity error on an internal register file.
+	(9, "pb_interface_pe"), # Parity error on the PB interface (address/aTag/tTag/rTAG).
+	(10, "pb_data_hang_errors"), # Any SMP interconnect data hang poll error (only checked for CI stores).
+	(11, "pb_hang_errors"), # Any SMP interconnect command hang error (domestic address range).
+	(12, "rd_are_errors"), # SMP interconnect address error (ARE) detected by a DMA read.
+	(13, "nonrd_are_errors"), # SMP interconnect address error detected by a DMA write or an interrupt engine.
+	(14, "pci_hang_error"), # PBCQ detected that the PCI load, store, EOI, or DMA read response did not make forward progress.
+	(15, "pci_clock_error"), # PBCQ has detected that the PCI clock has stopped.
+	(16, "PFIR_freeze"), # This is the freeze signal from the PFIR freeze output.
+	(17, "hw_errors"), # Any miscellaneous hardware error.
+	(18, "UnsolicitiedPBData"), # The PEC received data with an rTAG matching a queue that was not expecting data or too much data was received.
+	(19, "UnExpectedCResp"), # PEC received an unexpected combined response.
+	(20, "InvalidCResp"), # PEC received an invalid combined response.
+	(21, "PBUnsupportedSize"), # PEC received a CI load/store that hits a BAR but is an unsupported size or address alignment.
+]
+
+# Bits of the register that PHBError.reg_bits calls "PCI FIR".
+#
+# Each entry is (bit number, short name); the trailing comment on each line is
+# a longer description that is not part of the data.
+pfir_bits = [
+	(0, "register_pe"), # PBAIB register parity error.
+	(1, "hardware_error"), # Hardware error.
+	(2, "AIB_intf_error"), # AIB interface error.
+	(3, "ETU_Reset_error"), # ETU reset error.
+	(4, "PEC_scom_error"), # Common PEC SCOM error.
+	(5, "scomfir_error0"), # SCOM Error bit 0
+	(6, "scomfir_error1"), # SCOM Error bit 1
+]
+
+class PHBError:
+    reg_bits = {
+        "NEST FIR": nfir_bits,
+        "PCI FIR": pfir_bits,
+        "phbErrorStatus": phb_esr_bits,
+        "phbTxeErrorStatus": txe_esr_bits,
+        "phbRxeArbErrorStatus": rxe_arb_bits,
+        "phbRxeMrgErrorStatus": rxe_mrg_bits,
+        "phbRxeTceErrorStatus": rxe_tce_bits,
+        "phbRegbErrorStatus": regb_esr_bits,
+        "phbPblErrorStatus": pbl_esr_bits,
+    }
+
+    def __str__(self):
+        s = ""
+        for k, v in self.regs.items():
+            s += "{:30s} - {:#018x} - {}\n".format(k, v, ppc.setbits(v))
+        return s
+
+    def __init__(self, timestamp = 0):
+        self.timestamp = timestamp
+        self.pest = []
+        self.regs = {}
+
+    # NB: Value is a str, FIXME: Work out how to use python's type annotations
+    def set_reg(self, reg, value):
+        reg = reg.replace(" ", "")
+        if not self.regs.get(reg):
+            self.regs[reg] = value
+            return True
+        return False
+
+    def get_reg(self, reg):
+        reg = reg.replace(" ", "")
+        v = self.regs.get(reg)
+        if v:
+            return v
+        return 0
+
+    # NB: pest entries should be inserted in sort order, but it might be a good
+    # idea to explicitly sort them by PE number
+    def set_pest(self, pe, pesta, pestb):
+        self.pest.append((pe, pesta, pestb))
+
+    def get_pest(self, pe_number):
+        for pe, a, b in self.pest:
+            if pe == pe_number:
+                return (a, b)
+        return None
+
+    def header(self):
+        return self.timestamp
+
+    # TODO: move the formatting out of here and into the main loop
+    def show_errs(self):
+        out = ""
+        for reg_name,reg_bits in self.reg_bits.items():
+            reg_value = self.get_reg(reg_name)
+            parts = reg_name.split("Error");
+            if len(parts) > 1:
+                first_name = "{:s}FirstError{:s}".format(parts[0], parts[1])
+                first_value = self.get_reg(first_name)
+
+                # skiboot spells it wrong, so check Frst too
+                if first_value == 0:
+                    frst_name = "{:s}FrstError{:s}".format(parts[0], parts[1])
+                    first_value = self.get_reg(frst_name)
+            else:
+                first_value = 0
+
+            if reg_value == 0:
+                continue
+            out += "{} = {:016x}:\n".format(reg_name, reg_value);
+
+            for bit in reg_bits:
+                if ppc.ppcbit(bit[0]) & reg_value:
+                    bang = "!" if (ppc.ppcbit(bit[0]) & reg_value & first_value) == ppc.ppcbit(bit[0]) else ""
+                    out += "{:s}\t{:2d} - {}\n".format(bang, bit[0], bit[1])
+            out += "\n"
+
+        if len(self.pest) == 0:
+            return out
+
+        out += "PEST entries:\n"
+        for pe, pesta, pestb in self.pest:
+            out += "\tPEST[{:03x}] = {:016x} {:016x}\n".format(pe, pesta, pestb)
+
+        return out
+
+
+
+def parse_opal_log(log_text):
+    # Patterns to match:
+    #
+    # [  938.249526636,3] PHB#0030[8:0]:        NEST FIR WOF=0000800000000000
+    # [  938.250657886,3] PHB#0030[8:0]:               slotStatus = 00402000
+    # [  938.254305278,3] PHB#0030[8:0]:                PEST[511] = 3740002a01000000 0000000000000000
+    #
+    phblog_re = re.compile("" +
+        "^\[\s*[\d.,]+] " +           # skiboot log header
+        "(PHB#....\[.:.]):" +       # PHB name
+        "\s+" +                     # whitespace between the PHB and register name
+        "([^:=]+)" +                 # register name, NB: this might have some trailing WS
+        "=\s*" +                 # the '=' seperating name and value, along with the whitespace
+        "([a-fA-F\d ]+)")           # register value(s)
+
+    # this alone isn't really sufficent. There's a few cases that can cause a register
+    # dump to be generated (e.g. when the link is retrained we do a reg dump)
+    new_log_marker = re.compile("" +
+        "^\[ [\d.,]+] " +
+        "(PHB#....\[.:.]): " +
+        "PHB Freeze/Fence detected !")
+
+    # Store the current register set for each PHB. Keep in mind that we can have register
+    # dumps from different PHBs being interleaved in the register log.
+    current = {}
+
+    # list discovered error logs
+    error_logs = []
+
+    # Match things and split them on a per-PHB basis. We can get multiple PHB error logs
+    # printed interleaved in the skiboot log if there are multiple PHBs frozen.
+    for l in log_text.split("\n"):
+        m = new_log_marker.match(l)
+        if not m:
+            m = phblog_re.match(l)
+        if not m:
+            continue
+
+        match = m.groups()
+        phb = match[0]
+
+        # new log marker, save the current log and create a new one to store register values in
+        log = current.get(phb)
+        if not log:
+            current[phb] = PHBError(l);
+        elif len(match) == 1:
+            error_logs.append(current[phb])
+            current[phb] = PHBError(l) # create a new log object
+            log = current[phb]
+
+        if len(match) > 1:
+            if match[1].find("PEST") >= 0: # PEST entry
+                # NB: unlike .match() .search() scans the whole string
+                m = re.search("PEST\[([\da-fA-F]+)] = ([\da-fA-F]+) ([\da-fA-F]+)", l)
+                pe, pesta, pestb = [int(i, 16) for i in m.groups()]
+                current[phb].set_pest(pe, pesta, pestb)
+            else: # Normal register
+                name = match[1].strip()
+                value = int(match[2].strip(), 16)
+
+                ok = current[phb].set_reg(name, value)
+
+                # If we have duplicate registers then we're in a new log context
+                # so stash the current one and init a new one.
+                if not ok:
+                    error_logs.append(current[phb])
+                    current[phb] = PHBError(l)
+                    current[phb].set_reg(name, value)
+
+    # save all the logs we're still processing
+    for k,v in current.items():
+        error_logs.append(v)
+
+    return error_logs
+
+
+def parse_kernel_log(log_text):
+    """Extract PHB diag-data dumps from Linux kernel log text.
+
+    Returns a list of PHBError objects, one per "PHB4 PHB#<n> Diag-data"
+    header. Register lines seen before the first header are collected into a
+    log whose header is the empty string; that log is only returned if it
+    actually picked something up.
+
+    Example of the input being matched:
+
+    Mar 25 10:01:49 localhost kernel: PHB4 PHB#48 Diag-data (Version: 1)
+    Mar 25 10:01:49 localhost kernel: brdgCtl:    00000002
+    Mar 25 10:01:49 localhost kernel: RootSts:    00010020 00402000 a1030008 00100107 00002000
+    Mar 25 10:01:49 localhost kernel: RootErrSts: 00000000 00000000 00000001
+    Mar 25 10:01:49 localhost kernel: PhbSts:     0000001c00000000 0000001c00000000
+    Mar 25 10:01:49 localhost kernel: Lem:        0000000100280000 0000000000000000 0000000100000000
+    Mar 25 10:01:49 localhost kernel: PhbErr:     0000088000000000 0000008000000000 2148000098000240 a008400000000000
+    Mar 25 10:01:49 localhost kernel: RxeArbErr:  4000200000000000 0000200000000000 02409fde30000000 0000000000000000
+    Mar 25 10:01:49 localhost kernel: PblErr:     0000000001000000 0000000001000000 0000000000000000 0000000000000000
+    Mar 25 10:01:49 localhost kernel: PcieDlp:    0000000000000000 0000000000000000 ffff000000000000
+    Mar 25 10:01:49 localhost kernel: RegbErr:    0000004a10000800 0000000810000000 8800003c00000000 0000000007011000
+    Mar 25 10:01:49 localhost kernel: PE[1fd] A/B: a440002a05000000 8000000000000000
+    """
+    reg8  = "([0-9a-fA-F]{8})"
+    reg16 = "([0-9a-fA-F]{16})"
+
+    # TODO: pick up the AER stuff the kernel logs too?
+    # NB: The register names used for set_reg are the skiboot register names, not the kernel.
+    # TODO: check these for completeness / accuracy. I might have missed something
+    #
+    # Each entry is (pattern, name of group 1, name of group 2, ...).
+    #
+    # NOTE(review): "nFir" is stored under that name, but PHBError.reg_bits
+    # decodes "NEST FIR", so nFir from a kernel log is never decoded by
+    # show_errs() - confirm which name is intended.
+    register_patterns = [
+        (re.compile("brdgCtl:    {}"            .format(reg8)), "brdgCtl"),
+        (re.compile("RootSts:    {} {} {} {} {}".format(reg8, reg8, reg8, reg8, reg8)),
+                    'deviceStatus', 'slotStatus', 'linkStatus', 'devCmdStatus', 'devSecStatus'),
+        (re.compile("RootErrSts: {} {} {}"      .format(reg8, reg8, reg8)),
+                    'rootErrorStatus', 'uncorrErrorStatus', 'corrErrorStatus'),
+        (re.compile("PhbSts:     {} {}"         .format(reg16, reg16)), "phbPlssr", "phbCsr"),
+        (re.compile("nFir:       {} {} {}"      .format(reg16, reg16, reg16)), "nFir", "nFirMask", "nFirWOF"),
+        (re.compile("Lem:        {} {} {}"      .format(reg16, reg16, reg16)), "lemFir", "lemErrorMask", "lemWOF"),
+        (re.compile("PhbErr:     {} {} {} {}"   .format(reg16, reg16, reg16, reg16)),
+                    "phbErrorStatus", "phbFirstErrorStatus", "phbErrorLog0", "phbErrorLog1"),
+        # These must be spelled "phbTxe..." for PHBError.reg_bits to find
+        # "phbTxeErrorStatus" and its first-error companion.
+        (re.compile("PhbTxeErr:  {} {} {} {}"    .format(reg16, reg16, reg16, reg16)),
+                    "phbTxeErrorStatus", "phbTxeFirstErrorStatus", "phbTxeErrorLog0", "phbTxeErrorLog1"),
+        (re.compile("RxeArbErr:  {} {} {} {}"    .format(reg16, reg16, reg16, reg16)),
+                    "phbRxeArbErrorStatus", "phbRxeArbFirstErrorStatus", "phbRxeArbErrorLog0", "phbRxeArbErrorLog1"),
+        (re.compile("RxeMrgErr:  {} {} {} {}"    .format(reg16, reg16, reg16, reg16)),
+                    "phbRxeMrgErrorStatus", "phbRxeMrgFirstErrorStatus", "phbRxeMrgErrorLog0", "phbRxeMrgErrorLog1"),
+        (re.compile("RxeTceErr:  {} {} {} {}"    .format(reg16, reg16, reg16, reg16)),
+                    "phbRxeTceErrorStatus", "phbRxeTceFirstErrorStatus", "phbRxeTceErrorLog0", "phbRxeTceErrorLog1"),
+        (re.compile("PblErr:     {} {} {} {}"    .format(reg16, reg16, reg16, reg16)),
+                    "phbPblErrorStatus", "phbPblFirstErrorStatus", "phbPblErrorLog0", "phbPblErrorLog1"),
+        (re.compile("PcieDlp:    {} {} {}"       .format(reg16, reg16, reg16)),
+                    "phbPcieDlpErrorLog1", "phbPcieDlpErrorLog2", "phbPcieDlpErrorStatus"),
+        (re.compile("RegbErr:    {} {} {} {}"    .format(reg16, reg16, reg16, reg16)),
+                    "phbRegbErrorStatus", "phbRegbFirstErrorStatus", "phbRegbErrorLog0", "phbRegbErrorLog1"),
+    ]
+
+    header_pattern = re.compile("PHB4 PHB#[0-9]+ Diag-data") # match header
+    # the PE number is a three character field of hex digits, possibly space padded
+    pe_pattern = re.compile(r"PE\[{}\] A/B: {} {}".format("([ 0-9a-fA-F]{3})", reg16, reg16))
+
+    def has_content(entry):
+        # The placeholder created before any header has an empty header
+        # string; it is worth keeping only if it collected something.
+        return bool(entry.timestamp or entry.regs or entry.pest)
+
+    logs = []
+    log = PHBError("")
+
+    # pretty nasty but since interpreting the kernel logs requires context I
+    # don't have any better ideas
+    for line in log_text.split("\n"):
+        if header_pattern.search(line): # start a new log
+            if has_content(log):
+                logs.append(log)
+            log = PHBError(line)
+            continue
+
+        for pattern, *names in register_patterns:
+            m = pattern.search(line)
+            if not m:
+                continue
+            for name, val in zip(names, m.groups()):
+                log.set_reg(name, int(val, 16))
+            break
+
+        m = pe_pattern.search(line)
+        if m:
+            pe, pesta, pestb = (int(field, 16) for field in m.groups())
+            log.set_pest(pe, pesta, pestb)
+
+    if has_content(log):
+        logs.append(log)
+
+    return logs
+
+def main(argv):
+    """Print every PHB register dump found in the log file named by argv[1].
+
+    The same text is run through both the OPAL and the kernel log parsers.
+    Exits with status 1 if the file cannot be read.
+    """
+    if len(argv) < 2:
+        print("Usage: {} <log file>".format(argv[0]))
+        return
+
+    try:
+        # errors="replace" keeps a log containing undecodable bytes parseable
+        # instead of aborting on the first bad byte.
+        with open(argv[1], errors="replace") as f:
+            log_text = f.read()
+    except OSError as err:
+        print(err, file=sys.stderr)
+        sys.exit(1)
+
+    logs = parse_opal_log(log_text)
+    logs.extend(parse_kernel_log(log_text))
+
+    for err in logs:
+        print("==== PHB Register dump found ====")
+        print("")
+        print(err.header())
+        print("")
+        print(err.show_errs())
+
+if __name__ == "__main__":
+    main(sys.argv)
-- 
2.26.2



More information about the Skiboot mailing list