[PATCH v2] cxl: Wrap iterations over afu slices inside 'afu_list_lock'
Frederic Barrat
fbarrat at linux.ibm.com
Mon Jan 28 19:42:50 AEDT 2019
Hi Vaibhav,
2 comments below (one of which I missed on the previous iteration, sorry).
Le 26/01/2019 à 12:46, Vaibhav Jain a écrit :
> Within cxl module, iteration over array 'adapter->slices' may be racy
> at few points as it might be simultaneously read during an EEH and its
> contents being set to NULL while driver is being unloaded or unbound
> from the adapter. This might result in a NULL pointer to 'struct afu'
> being de-referenced during an EEH thereby causing a kernel oops.
>
> This patch fixes this by making sure that all access to the array
> 'adapter->slices' is wrapped within the context of spin-lock
> 'adapter->afu_list_lock'.
>
> Signed-off-by: Vaibhav Jain <vaibhav at linux.ibm.com>
> ---
> Changelog:
>
> v2:
> * Fixed a wrong comparison of non-null pointer [Fred]
> * Moved a call to cxl_vphb_error_detected() within a branch that
> checks for not null AFU pointer in 'adapter->slices' [Fred]
> * Removed a misleading comment in code.
> ---
> drivers/misc/cxl/guest.c | 2 ++
> drivers/misc/cxl/pci.c | 43 ++++++++++++++++++++++++++++------------
> 2 files changed, 32 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
> index 5d28d9e454f5..08f4a512afad 100644
> --- a/drivers/misc/cxl/guest.c
> +++ b/drivers/misc/cxl/guest.c
> @@ -267,6 +267,7 @@ static int guest_reset(struct cxl *adapter)
> int i, rc;
>
> pr_devel("Adapter reset request\n");
> + spin_lock(&adapter->afu_list_lock);
> for (i = 0; i < adapter->slices; i++) {
> if ((afu = adapter->afu[i])) {
> pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
> @@ -283,6 +284,7 @@ static int guest_reset(struct cxl *adapter)
> pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
> }
> }
> + spin_unlock(&adapter->afu_list_lock);
> return rc;
> }
>
> diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
> index c79ba1c699ad..ca968a889425 100644
> --- a/drivers/misc/cxl/pci.c
> +++ b/drivers/misc/cxl/pci.c
> @@ -1805,7 +1805,7 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
> /* There should only be one entry, but go through the list
> * anyway
> */
> - if (afu->phb == NULL)
> + if (afu == NULL || afu->phb == NULL)
> return result;
>
> list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
> @@ -1832,7 +1832,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
> {
> struct cxl *adapter = pci_get_drvdata(pdev);
> struct cxl_afu *afu;
> - pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result;
> + pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
> + pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
> int i;
>
> /* At this point, we could still have an interrupt pending.
> @@ -1843,6 +1844,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
>
> /* If we're permanently dead, give up. */
> if (state == pci_channel_io_perm_failure) {
> + spin_lock(&adapter->afu_list_lock);
> for (i = 0; i < adapter->slices; i++) {
> afu = adapter->afu[i];
> /*
> @@ -1851,6 +1853,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
> */
> cxl_vphb_error_detected(afu, state);
> }
> + spin_unlock(&adapter->afu_list_lock);
> return PCI_ERS_RESULT_DISCONNECT;
> }
>
> @@ -1932,14 +1935,19 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
> * * In slot_reset, free the old resources and allocate new ones.
> * * In resume, clear the flag to allow things to start.
> */
> +
> + /* Make sure no one else changes the afu list */
> + spin_lock(&adapter->afu_list_lock);
> +
> for (i = 0; i < adapter->slices; i++) {
> afu = adapter->afu[i];
>
> - afu_result = cxl_vphb_error_detected(afu, state);
> -
> - cxl_context_detach_all(afu);
> - cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
> - pci_deconfigure_afu(afu);
> + if (afu != NULL) {
> + afu_result = cxl_vphb_error_detected(afu, state);
> + cxl_context_detach_all(afu);
> + cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
> + pci_deconfigure_afu(afu);
> + }
>
> /* Disconnect trumps all, NONE trumps NEED_RESET */
> if (afu_result == PCI_ERS_RESULT_DISCONNECT)
Thanks for moving the call to cxl_vphb_error_detected(), but now, the
"if (afu_result == PCI_ERS_RESULT_DISCONNECT)" test looks like it should
also be part of the "if (afu != NULL)" statement (and then you wouldn't
have hit the warning about uninitialized afu_result). Current code would
work, but looks awkward since there's no need to check afu_result at
each iteration if afu is NULL.
> @@ -1948,6 +1956,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
> (result == PCI_ERS_RESULT_NEED_RESET))
> result = PCI_ERS_RESULT_NONE;
> }
> + spin_unlock(&adapter->afu_list_lock);
>
> /* should take the context lock here */
> if (cxl_adapter_context_lock(adapter) != 0)
> @@ -1980,14 +1989,15 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
> */
> cxl_adapter_context_unlock(adapter);
>
> + spin_lock(&adapter->afu_list_lock);
> for (i = 0; i < adapter->slices; i++) {
> afu = adapter->afu[i];
>
We're missing here:
if (afu == NULL)
continue;
Not introduced with this patch, but since you're fixing a few similar
cases...
Thanks,
Fred
> if (pci_configure_afu(afu, adapter, pdev))
> - goto err;
> + goto err_unlock;
>
> if (cxl_afu_select_best_mode(afu))
> - goto err;
> + goto err_unlock;
>
> if (afu->phb == NULL)
> continue;
> @@ -1999,16 +2009,16 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
> ctx = cxl_get_context(afu_dev);
>
> if (ctx && cxl_release_context(ctx))
> - goto err;
> + goto err_unlock;
>
> ctx = cxl_dev_context_init(afu_dev);
> if (IS_ERR(ctx))
> - goto err;
> + goto err_unlock;
>
> afu_dev->dev.archdata.cxl_ctx = ctx;
>
> if (cxl_ops->afu_check_and_enable(afu))
> - goto err;
> + goto err_unlock;
>
> afu_dev->error_state = pci_channel_io_normal;
>
> @@ -2029,8 +2039,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
> result = PCI_ERS_RESULT_DISCONNECT;
> }
> }
> +
> + spin_unlock(&adapter->afu_list_lock);
> return result;
>
> +err_unlock:
> + spin_unlock(&adapter->afu_list_lock);
> +
> err:
> /* All the bits that happen in both error_detected and cxl_remove
> * should be idempotent, so we don't need to worry about leaving a mix
> @@ -2051,10 +2066,11 @@ static void cxl_pci_resume(struct pci_dev *pdev)
> * This is not the place to be checking if everything came back up
> * properly, because there's no return value: do that in slot_reset.
> */
> + spin_lock(&adapter->afu_list_lock);
> for (i = 0; i < adapter->slices; i++) {
> afu = adapter->afu[i];
>
> - if (afu->phb == NULL)
> + if (afu == NULL || afu->phb == NULL)
> continue;
>
> list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
> @@ -2063,6 +2079,7 @@ static void cxl_pci_resume(struct pci_dev *pdev)
> afu_dev->driver->err_handler->resume(afu_dev);
> }
> }
> + spin_unlock(&adapter->afu_list_lock);
> }
>
> static const struct pci_error_handlers cxl_err_handler = {
>
More information about the Linuxppc-dev
mailing list