[PATCH 2/9] powerpc/eeh: Manage EEH_PE_RECOVERING inside eeh_handle_normal_event()
Daniel Axtens
dja at axtens.net
Tue Mar 6 12:47:57 AEDT 2018
Hi Sam,
> Currently the EEH_PE_RECOVERING flag for a PE is managed by both the
> caller and callee of eeh_handle_normal_event() (among other places not
> considered here). This is complicated by the fact that the PE may
> or may not have been invalidated by the call.
So, I applied this patch and looked at those places. They're now
restricted to eeh_driver.c and eeh.c.
The only other place it's marked is
eeh_driver.c::eeh_pe_reset_and_recover(), which is itself only called
from eeh.c::eeh_pe_change_owner(). That is only called from 2 places,
both in eeh.c - eeh_dev_open() and eeh_dev_release(). I have not yet
wrapped my head around the logic of that.
I suspect the entire logic can have some more simplifications, but this
is definitely a good step.
I also read through your patch and have no other comments, so:
Reviewed-by: Daniel Axtens <dja at axtens.net>
Regards,
Daniel
>
> So move the callee's handling into eeh_handle_normal_event(), which
> clarifies it and allows the return type to be changed to void (because
> it no longer needs to indicate at the PE has been invalidated).
>
> This should not change behaviour except in eeh_event_handler() where
> it was previously possible to cause eeh_pe_state_clear() to be called
> on an invalid PE, which is now avoided.
>
> Signed-off-by: Sam Bobroff <sam.bobroff at au1.ibm.com>
> ---
> arch/powerpc/include/asm/eeh_event.h | 2 +-
> arch/powerpc/kernel/eeh_driver.c | 29 +++++++++++------------------
> arch/powerpc/kernel/eeh_event.c | 2 --
> 3 files changed, 12 insertions(+), 21 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
> index 0a168038882d..9884e872686f 100644
> --- a/arch/powerpc/include/asm/eeh_event.h
> +++ b/arch/powerpc/include/asm/eeh_event.h
> @@ -34,7 +34,7 @@ struct eeh_event {
> int eeh_event_init(void);
> int eeh_send_failure_event(struct eeh_pe *pe);
> void eeh_remove_event(struct eeh_pe *pe, bool force);
> -bool eeh_handle_normal_event(struct eeh_pe *pe);
> +void eeh_handle_normal_event(struct eeh_pe *pe);
> void eeh_handle_special_event(void);
>
> #endif /* __KERNEL__ */
> diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
> index 51b21c97910f..5b7a5ed4db4d 100644
> --- a/arch/powerpc/kernel/eeh_driver.c
> +++ b/arch/powerpc/kernel/eeh_driver.c
> @@ -733,7 +733,8 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
>
> /**
> * eeh_handle_normal_event - Handle EEH events on a specific PE
> - * @pe: EEH PE
> + * @pe: EEH PE - which should not be used after we return, as it may
> + * have been invalidated.
> *
> * Attempts to recover the given PE. If recovery fails or the PE has failed
> * too many times, remove the PE.
> @@ -750,10 +751,8 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
> * & devices under this slot, and then finally restarting the device
> * drivers (which cause a second set of hotplug events to go out to
> * userspace).
> - *
> - * Returns true if @pe should no longer be used, else false.
> */
> -bool eeh_handle_normal_event(struct eeh_pe *pe)
> +void eeh_handle_normal_event(struct eeh_pe *pe)
> {
> struct pci_bus *frozen_bus;
> struct eeh_dev *edev, *tmp;
> @@ -765,9 +764,11 @@ bool eeh_handle_normal_event(struct eeh_pe *pe)
> if (!frozen_bus) {
> pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
> __func__, pe->phb->global_number, pe->addr);
> - return false;
> + return;
> }
>
> + eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
> +
> eeh_pe_update_time_stamp(pe);
> pe->freeze_count++;
> if (pe->freeze_count > eeh_max_freezes) {
> @@ -904,7 +905,7 @@ bool eeh_handle_normal_event(struct eeh_pe *pe)
> pr_info("EEH: Notify device driver to resume\n");
> eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
>
> - return false;
> + goto final;
>
> hard_fail:
> /*
> @@ -940,12 +941,12 @@ bool eeh_handle_normal_event(struct eeh_pe *pe)
> pci_lock_rescan_remove();
> pci_hp_remove_devices(frozen_bus);
> pci_unlock_rescan_remove();
> -
> /* The passed PE should no longer be used */
> - return true;
> + return;
> }
> }
> - return false;
> +final:
> + eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
> }
>
> /**
> @@ -1018,15 +1019,7 @@ void eeh_handle_special_event(void)
> */
> if (rc == EEH_NEXT_ERR_FROZEN_PE ||
> rc == EEH_NEXT_ERR_FENCED_PHB) {
> - /*
> - * eeh_handle_normal_event() can make the PE stale if it
> - * determines that the PE cannot possibly be recovered.
> - * Don't modify the PE state if that's the case.
> - */
> - if (eeh_handle_normal_event(pe))
> - continue;
> -
> - eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
> + eeh_handle_normal_event(pe);
> } else {
> pci_lock_rescan_remove();
> list_for_each_entry(hose, &hose_list, list_node) {
> diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
> index 872bcfe8f90e..61c9356bf9c9 100644
> --- a/arch/powerpc/kernel/eeh_event.c
> +++ b/arch/powerpc/kernel/eeh_event.c
> @@ -73,7 +73,6 @@ static int eeh_event_handler(void * dummy)
> /* We might have event without binding PE */
> pe = event->pe;
> if (pe) {
> - eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
> if (pe->type & EEH_PE_PHB)
> pr_info("EEH: Detected error on PHB#%x\n",
> pe->phb->global_number);
> @@ -82,7 +81,6 @@ static int eeh_event_handler(void * dummy)
> "PHB#%x-PE#%x\n",
> pe->phb->global_number, pe->addr);
> eeh_handle_normal_event(pe);
> - eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
> } else {
> eeh_handle_special_event();
> }
> --
> 2.16.1.74.g9b0b1f47b
More information about the Linuxppc-dev
mailing list