[PATCH RFC 09/15] powerpc/eeh: Sync eeh_handle_special_event(), pnv_eeh_get_pe(), pnv_eeh_next_error()
Sam Bobroff
sbobroff at linux.ibm.com
Wed Oct 2 16:02:47 AEST 2019
Synchronize access to eeh_pe.
Signed-off-by: Sam Bobroff <sbobroff at linux.ibm.com>
---
arch/powerpc/kernel/eeh_driver.c | 15 +++++---
arch/powerpc/platforms/powernv/eeh-powernv.c | 38 ++++++++++++++++----
2 files changed, 43 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index c9d73070793e..bc5d58bf3904 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -1184,6 +1184,7 @@ void eeh_handle_special_event(void)
do {
+ /* Acquire ref if rc == _FROZEN_PE, _FENCED_PHB or _DEAD_PHB */
rc = eeh_ops->next_error(&pe);
switch (rc) {
@@ -1195,10 +1196,11 @@ void eeh_handle_special_event(void)
eeh_remove_event(NULL, true);
list_for_each_entry(hose, &hose_list, list_node) {
- phb_pe = eeh_phb_pe_get(hose);
+ phb_pe = eeh_phb_pe_get(hose); /* Acquire ref */
if (!phb_pe) continue;
eeh_pe_mark_isolated(phb_pe);
+ eeh_put_pe(phb_pe); /* Release ref */
}
eeh_serialize_unlock(flags);
@@ -1236,15 +1238,17 @@ void eeh_handle_special_event(void)
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
rc == EEH_NEXT_ERR_FENCED_PHB) {
eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
- eeh_handle_normal_event(pe);
+ eeh_handle_normal_event(pe); /* Give ref */
} else {
pci_lock_rescan_remove();
list_for_each_entry(hose, &hose_list, list_node) {
- phb_pe = eeh_phb_pe_get(hose);
+ phb_pe = eeh_phb_pe_get(hose); /* Acquire ref */
if (!phb_pe ||
!(phb_pe->state & EEH_PE_ISOLATED) ||
- (phb_pe->state & EEH_PE_RECOVERING))
+ (phb_pe->state & EEH_PE_RECOVERING)) {
+ eeh_put_pe(phb_pe); /* Release ref */
continue;
+ }
eeh_for_each_pe(pe, tmp_pe)
eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
@@ -1263,11 +1267,14 @@ void eeh_handle_special_event(void)
__func__,
pe->phb->global_number,
pe->addr);
+ eeh_put_pe(phb_pe); /* Release ref */
break;
}
pci_hp_remove_devices(bus);
+ eeh_put_pe(phb_pe); /* Release ref */
}
pci_unlock_rescan_remove();
+ eeh_put_pe(pe); /* Release ref */
}
/*
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index e477e0b70968..c56a796dd894 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1404,6 +1404,7 @@ static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose)
}
}
+/* A return of 0 indicates that *pe is set, and referenced. */
static int pnv_eeh_get_pe(struct pci_controller *hose,
u16 pe_no, struct eeh_pe **pe)
{
@@ -1431,6 +1432,7 @@ static int pnv_eeh_get_pe(struct pci_controller *hose,
/* Freeze the (compound) PE */
*pe = dev_pe;
+ eeh_get_pe(*pe); /* Acquire ref */
if (!(dev_pe->state & EEH_PE_ISOLATED))
phb->freeze_pe(phb, pe_no);
@@ -1439,23 +1441,26 @@ static int pnv_eeh_get_pe(struct pci_controller *hose,
* have been frozen. However, we still need poke until
* hitting the frozen PE on top level.
*/
- dev_pe = dev_pe->parent;
+ eeh_pe_move_to_parent(&dev_pe);
while (dev_pe && !(dev_pe->type & EEH_PE_PHB)) {
int ret;
ret = eeh_ops->get_state(dev_pe, NULL);
if (ret <= 0 || eeh_state_active(ret)) {
- dev_pe = dev_pe->parent;
+ eeh_pe_move_to_parent(&dev_pe);
continue;
}
/* Frozen parent PE */
+ eeh_put_pe(*pe); /* Release ref */
*pe = dev_pe;
+ eeh_get_pe(*pe); /* Acquire ref */
if (!(dev_pe->state & EEH_PE_ISOLATED))
phb->freeze_pe(phb, dev_pe->addr);
/* Next one */
- dev_pe = dev_pe->parent;
+ eeh_pe_move_to_parent(&dev_pe);
}
+ eeh_put_pe(dev_pe);
return 0;
}
@@ -1469,6 +1474,8 @@ static int pnv_eeh_get_pe(struct pci_controller *hose,
* OPAL APIs for next error to handle. The informational error is
* handled internally by platform. However, the dead IOC, dead PHB,
* fenced PHB and frozen PE should be handled by EEH core eventually.
+ * On return, *pe will be ref'd iff returning _FROZEN_PE, _FENCED_PHB or
+ * _DEAD_PHB.
*/
static int pnv_eeh_next_error(struct eeh_pe **pe)
{
@@ -1479,6 +1486,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
__be16 err_type, severity;
long rc;
int state, ret = EEH_NEXT_ERR_NONE;
+ unsigned long flags;
/*
* While running here, it's safe to purge the event queue. The
@@ -1493,9 +1501,11 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
* needn't take care of it any more.
*/
phb = hose->private_data;
- phb_pe = eeh_phb_pe_get(hose);
- if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
+ phb_pe = eeh_phb_pe_get(hose); /* Acquire ref */
+ if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED)) {
+ eeh_put_pe(phb_pe); /* Release ref */
continue;
+ }
rc = opal_pci_next_error(phb->opal_id,
&frozen_pe_no, &err_type, &severity);
@@ -1503,6 +1513,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
pr_devel("%s: Invalid return value on "
"PHB#%x (0x%lx) from opal_pci_next_error",
__func__, hose->global_number, rc);
+ eeh_put_pe(phb_pe); /* Release ref */
continue;
}
@@ -1511,6 +1522,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) {
pr_devel("%s: No error found on PHB#%x\n",
__func__, hose->global_number);
+ eeh_put_pe(phb_pe); /* Release ref */
continue;
}
@@ -1539,19 +1551,23 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
case OPAL_EEH_PHB_ERROR:
if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) {
*pe = phb_pe;
+ eeh_get_pe(*pe); /* Acquire ref */
pr_err("EEH: dead PHB#%x detected, "
"location: %s\n",
hose->global_number,
eeh_pe_loc_get(phb_pe));
ret = EEH_NEXT_ERR_DEAD_PHB;
+ /* Retain ref on pe */
} else if (be16_to_cpu(severity) ==
OPAL_EEH_SEV_PHB_FENCED) {
*pe = phb_pe;
+ eeh_get_pe(*pe); /* Acquire ref */
pr_err("EEH: Fenced PHB#%x detected, "
"location: %s\n",
hose->global_number,
eeh_pe_loc_get(phb_pe));
ret = EEH_NEXT_ERR_FENCED_PHB;
+ /* Retain ref on pe */
} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
pr_info("EEH: PHB#%x informative error "
"detected, location: %s\n",
@@ -1568,8 +1584,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
* If we can't find the corresponding PE, we
* just try to unfreeze.
*/
+ /* Maybe acquire ref */
if (pnv_eeh_get_pe(hose,
be64_to_cpu(frozen_pe_no), pe)) {
+ /* 'pe' was not set by pnv_eeh_get_pe() */
pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
hose->global_number, be64_to_cpu(frozen_pe_no));
pr_info("EEH: PHB location: %s\n",
@@ -1589,6 +1607,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
ret = EEH_NEXT_ERR_NONE;
} else if ((*pe)->state & EEH_PE_ISOLATED ||
eeh_pe_passed(*pe)) {
+ eeh_put_pe(*pe); /* Release ref */
ret = EEH_NEXT_ERR_NONE;
} else {
pr_err("EEH: Frozen PE#%x "
@@ -1600,6 +1619,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
eeh_pe_loc_get(*pe),
eeh_pe_loc_get(phb_pe));
ret = EEH_NEXT_ERR_FROZEN_PE;
+ /* Retain ref on pe */
}
break;
@@ -1631,7 +1651,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
* we need have to handle frozen parent PE firstly.
*/
if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+ eeh_lock_pes(&flags);
parent_pe = (*pe)->parent;
+ eeh_get_pe(parent_pe);
+ eeh_unlock_pes(flags);
while (parent_pe) {
/* Hit the ceiling ? */
if (parent_pe->type & EEH_PE_PHB)
@@ -1643,13 +1666,15 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
*pe = parent_pe;
/* Next parent level */
- parent_pe = parent_pe->parent;
+ eeh_pe_move_to_parent(&parent_pe);
}
+ eeh_put_pe(parent_pe); /* Release ref (for early-out) */
/* We possibly migrate to another PE */
eeh_pe_mark_isolated(*pe);
}
+ eeh_put_pe(phb_pe); /* Release ref */
/*
* If we have no errors on the specific PHB or only
* informative error there, we continue poking it.
@@ -1664,6 +1689,7 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
if (ret == EEH_NEXT_ERR_NONE && eeh_enabled())
enable_irq(eeh_event_irq);
+ /* *pe may be ref'd, see above */
return ret;
}
--
2.22.0.216.g00a2a96fc9
More information about the Linuxppc-dev
mailing list