[Skiboot] [PATCH v3] phb4: Check for RX errors after link training

Michael Neuling mikey at neuling.org
Tue Oct 30 11:02:30 AEDT 2018


From: Oliver O'Halloran <oohall at gmail.com>

Some PHB4 PHYs can get stuck in a bad state where they are constantly
retraining the link. This happens transparently to skiboot and Linux
but will cause PCIe to be slow. Resetting the PHB4 clears the
problem.

We can detect this case by looking at the RX errors count where we
check for link stability. This patch does this by modifying the link
optimal code to check for RX errors. If errors are occurring we
retrain the link irrespective of the chip rev or card.

Normally when this problem occurs, the RX error count is maxed out at
255. When there is no problem, the count is 0. We chose 8 as the max
rx errors value to give us some margin for a few errors. There is also
a knob that can be used to set the error threshold for when we should
retrain the link, e.g.:

  nvram -p ibm,skiboot --update-config phb-rx-err-max=8

Signed-off-by: Oliver O'Halloran <oohall at gmail.com>
Signed-off-by: Michael Neuling <mikey at neuling.org>

---
v3:
  - Minor printing change.
  - Handle int vs uint8_t conversion properly

v2:
  - Print info about which lane caused the error.
  - No functional change.
---
 hw/phb4.c           | 29 ++++++++++++++++++++++++++---
 include/phb4-regs.h |  2 ++
 include/phb4.h      |  2 ++
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/hw/phb4.c b/hw/phb4.c
index 67983634c0..5578cb426c 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -152,6 +152,7 @@ static bool verbose_eeh;
 static bool pci_tracing;
 static bool pci_eeh_mmio;
 static bool pci_retry_all;
+static int rx_err_max = PHB4_RX_ERR_MAX;
 
 /* Note: The "ASB" name is historical, practically this means access via
  * the XSCOM backdoor
@@ -2672,11 +2673,12 @@ static void phb4_lane_eq_change(struct phb4 *p, uint32_t vdid)
 static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
 {
 	struct phb4 *p = phb_to_phb4(slot->phb);
+	uint64_t reg;
 	uint32_t id;
-	uint16_t bdfn;
-	uint8_t trained_speed, phb_speed, dev_speed, target_speed;
+	uint16_t bdfn, lane_errs;
+	uint8_t trained_speed, phb_speed, dev_speed, target_speed, rx_errs;
 	uint8_t trained_width, phb_width, dev_width, target_width;
-	bool optimal_speed, optimal_width, optimal, retry_enabled;
+	bool optimal_speed, optimal_width, optimal, retry_enabled, rx_err_ok;
 
 
 	/* Current trained state */
@@ -2702,6 +2704,11 @@ static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
 	retry_enabled = (phb4_chip_retry_workaround() &&
 			 phb4_adapter_in_whitelist(id)) ||
 		phb4_lane_eq_retry_whitelist(id);
+	reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_COUNTERS);
+	rx_errs =  GETFIELD(PHB_PCIE_DLP_RX_ERR_CNT, reg);
+	rx_err_ok = (rx_errs < rx_err_max);
+	reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_STATUS);
+	lane_errs = GETFIELD(PHB_PCIE_DLP_LANE_ERR, reg);
 
 	PHBDBG(p, "LINK: Card [%04x:%04x] %s Retry:%s\n", VENDOR(id),
 	       DEVICE(id), optimal ? "Optimal" : "Degraded",
@@ -2710,10 +2717,16 @@ static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
 	       trained_speed, phb_speed, dev_speed, optimal_speed ? "" : " *");
 	PHBDBG(p, "LINK: Width Train:x%02i PHB:x%02i DEV:x%02i%s\n",
 	       trained_width, phb_width, dev_width, optimal_width ? "" : " *");
+	PHBDBG(p, "LINK: RX Errors Now:%i Max:%i Lane:0x%04x%s\n",
+	       rx_errs, rx_err_max, lane_errs, rx_err_ok ? "" : " *");
 
 	if (vdid)
 		*vdid = id;
 
+	/* Always do RX error retry irrespective of chip and card */
+	if (!rx_err_ok)
+		return false;
+
 	if (!retry_enabled)
 		return true;
 
@@ -5778,6 +5791,7 @@ static void phb4_probe_pbcq(struct dt_node *pbcq)
 void probe_phb4(void)
 {
 	struct dt_node *np;
+	const char *s;
 
 	verbose_eeh = nvram_query_eq("pci-eeh-verbose", "true");
 	/* REMOVEME: force this for now until we stabalise PCIe */
@@ -5788,6 +5802,15 @@ void probe_phb4(void)
 	pci_tracing = nvram_query_eq("pci-tracing", "true");
 	pci_eeh_mmio = !nvram_query_eq("pci-eeh-mmio", "disabled");
 	pci_retry_all = nvram_query_eq("pci-retry-all", "true");
+	s = nvram_query("phb-rx-err-max");
+	if (s) {
+		rx_err_max = atoi(s);
+
+		/* Clip to uint8_t used by hardware */
+		rx_err_max = MAX(rx_err_max, 0);
+		rx_err_max = MIN(rx_err_max, 255);
+	}
+	prlog(PR_DEBUG, "PHB4: Maximum RX errors during training: %d\n", rx_err_max);
 
 	/* Look for PBCQ XSCOM nodes */
 	dt_for_each_compatible(dt_root, np, "ibm,power9-pbcq")
diff --git a/include/phb4-regs.h b/include/phb4-regs.h
index ef3cfa93ec..28f22f9559 100644
--- a/include/phb4-regs.h
+++ b/include/phb4-regs.h
@@ -312,7 +312,9 @@
 #define PHB_PCIE_DLP_ERRLOG1		0x1AA0
 #define PHB_PCIE_DLP_ERRLOG2		0x1AA8
 #define PHB_PCIE_DLP_ERR_STATUS		0x1AB0
+#define  PHB_PCIE_DLP_LANE_ERR	 	PPC_BITMASK(0,15)
 #define PHB_PCIE_DLP_ERR_COUNTERS	0x1AB8
+#define  PHB_PCIE_DLP_RX_ERR_CNT 	PPC_BITMASK(16,23)
 
 #define PHB_PCIE_LANE_EQ_CNTL0		0x1AD0
 #define PHB_PCIE_LANE_EQ_CNTL1		0x1AD8
diff --git a/include/phb4.h b/include/phb4.h
index d78bc31752..43819d57b8 100644
--- a/include/phb4.h
+++ b/include/phb4.h
@@ -159,6 +159,8 @@ struct phb4_err {
 #define PHB4_LINK_ELECTRICAL_RETRIES	100
 #define PHB4_LINK_WAIT_RETRIES		200
 
+#define PHB4_RX_ERR_MAX			8
+
 /* PHB4 flags */
 #define PHB4_AIB_FENCED		0x00000001
 #define PHB4_CFG_USE_ASB	0x00000002
-- 
2.17.2



More information about the Skiboot mailing list