[Skiboot] [PATCH v3] phb4: Check for RX errors after link training
Michael Neuling
mikey at neuling.org
Tue Oct 30 11:02:30 AEDT 2018
From: Oliver O'Halloran <oohall at gmail.com>
Some PHB4 PHYs can get stuck in a bad state where they are constantly
retraining the link. This happens transparently to skiboot and Linux
but will cause PCIe to be slow. Resetting the PHB4 clears the
problem.
We can detect this case by looking at the RX errors count where we
check for link stability. This patch does this by modifying the link
optimal code to check for RX errors. If errors are occurring we
retrain the link irrespective of the chip rev or card.
Normally when this problem occurs, the RX error count is maxed out at
255. When there is no problem, the count is 0. We chose 8 as the max
rx errors value to give us some margin for a few errors. There is also
a knob that can be used to set the error threshold for when we should
retrain the link, e.g.:
nvram -p ibm,skiboot --update-config phb-rx-err-max=8
Signed-off-by: Oliver O'Halloran <oohall at gmail.com>
Signed-off-by: Michael Neuling <mikey at neuling.org>
---
v3:
- Minor printing change.
- Handle int vs uint8_t conversion properly
v2:
- Print info about which lane caused the error.
- No functional change.
---
hw/phb4.c | 29 ++++++++++++++++++++++++++---
include/phb4-regs.h | 2 ++
include/phb4.h | 2 ++
3 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/hw/phb4.c b/hw/phb4.c
index 67983634c0..5578cb426c 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -152,6 +152,7 @@ static bool verbose_eeh;
static bool pci_tracing;
static bool pci_eeh_mmio;
static bool pci_retry_all;
+static int rx_err_max = PHB4_RX_ERR_MAX;
/* Note: The "ASB" name is historical, practically this means access via
* the XSCOM backdoor
@@ -2672,11 +2673,12 @@ static void phb4_lane_eq_change(struct phb4 *p, uint32_t vdid)
static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
{
struct phb4 *p = phb_to_phb4(slot->phb);
+ uint64_t reg;
uint32_t id;
- uint16_t bdfn;
- uint8_t trained_speed, phb_speed, dev_speed, target_speed;
+ uint16_t bdfn, lane_errs;
+ uint8_t trained_speed, phb_speed, dev_speed, target_speed, rx_errs;
uint8_t trained_width, phb_width, dev_width, target_width;
- bool optimal_speed, optimal_width, optimal, retry_enabled;
+ bool optimal_speed, optimal_width, optimal, retry_enabled, rx_err_ok;
/* Current trained state */
@@ -2702,6 +2704,11 @@ static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
retry_enabled = (phb4_chip_retry_workaround() &&
phb4_adapter_in_whitelist(id)) ||
phb4_lane_eq_retry_whitelist(id);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_COUNTERS);
+ rx_errs = GETFIELD(PHB_PCIE_DLP_RX_ERR_CNT, reg);
+ rx_err_ok = (rx_errs < rx_err_max);
+ reg = in_be64(p->regs + PHB_PCIE_DLP_ERR_STATUS);
+ lane_errs = GETFIELD(PHB_PCIE_DLP_LANE_ERR, reg);
PHBDBG(p, "LINK: Card [%04x:%04x] %s Retry:%s\n", VENDOR(id),
DEVICE(id), optimal ? "Optimal" : "Degraded",
@@ -2710,10 +2717,16 @@ static bool phb4_link_optimal(struct pci_slot *slot, uint32_t *vdid)
trained_speed, phb_speed, dev_speed, optimal_speed ? "" : " *");
PHBDBG(p, "LINK: Width Train:x%02i PHB:x%02i DEV:x%02i%s\n",
trained_width, phb_width, dev_width, optimal_width ? "" : " *");
+ PHBDBG(p, "LINK: RX Errors Now:%i Max:%i Lane:0x%04x%s\n",
+ rx_errs, rx_err_max, lane_errs, rx_err_ok ? "" : " *");
if (vdid)
*vdid = id;
+ /* Always do RX error retry irrespective of chip and card */
+ if (!rx_err_ok)
+ return false;
+
if (!retry_enabled)
return true;
@@ -5778,6 +5791,7 @@ static void phb4_probe_pbcq(struct dt_node *pbcq)
void probe_phb4(void)
{
struct dt_node *np;
+ const char *s;
verbose_eeh = nvram_query_eq("pci-eeh-verbose", "true");
/* REMOVEME: force this for now until we stabalise PCIe */
@@ -5788,6 +5802,15 @@ void probe_phb4(void)
pci_tracing = nvram_query_eq("pci-tracing", "true");
pci_eeh_mmio = !nvram_query_eq("pci-eeh-mmio", "disabled");
pci_retry_all = nvram_query_eq("pci-retry-all", "true");
+ s = nvram_query("phb-rx-err-max");
+ if (s) {
+ rx_err_max = atoi(s);
+
+ /* Clip to uint8_t used by hardware */
+ rx_err_max = MAX(rx_err_max, 0);
+ rx_err_max = MIN(rx_err_max, 255);
+ }
+ prlog(PR_DEBUG, "PHB4: Maximum RX errors during training: %d\n", rx_err_max);
/* Look for PBCQ XSCOM nodes */
dt_for_each_compatible(dt_root, np, "ibm,power9-pbcq")
diff --git a/include/phb4-regs.h b/include/phb4-regs.h
index ef3cfa93ec..28f22f9559 100644
--- a/include/phb4-regs.h
+++ b/include/phb4-regs.h
@@ -312,7 +312,9 @@
#define PHB_PCIE_DLP_ERRLOG1 0x1AA0
#define PHB_PCIE_DLP_ERRLOG2 0x1AA8
#define PHB_PCIE_DLP_ERR_STATUS 0x1AB0
+#define PHB_PCIE_DLP_LANE_ERR PPC_BITMASK(0,15)
#define PHB_PCIE_DLP_ERR_COUNTERS 0x1AB8
+#define PHB_PCIE_DLP_RX_ERR_CNT PPC_BITMASK(16,23)
#define PHB_PCIE_LANE_EQ_CNTL0 0x1AD0
#define PHB_PCIE_LANE_EQ_CNTL1 0x1AD8
diff --git a/include/phb4.h b/include/phb4.h
index d78bc31752..43819d57b8 100644
--- a/include/phb4.h
+++ b/include/phb4.h
@@ -159,6 +159,8 @@ struct phb4_err {
#define PHB4_LINK_ELECTRICAL_RETRIES 100
#define PHB4_LINK_WAIT_RETRIES 200
+#define PHB4_RX_ERR_MAX 8
+
/* PHB4 flags */
#define PHB4_AIB_FENCED 0x00000001
#define PHB4_CFG_USE_ASB 0x00000002
--
2.17.2
More information about the Skiboot
mailing list