[PATCH v3 02/11] cxl: Drop commands if the PCI channel is not in normal state

Cyril Bur cyrilbur at gmail.com
Wed Aug 12 16:35:14 AEST 2015


On Wed, 12 Aug 2015 10:48:11 +1000
Daniel Axtens <dja at axtens.net> wrote:

> If the PCI channel has gone down, don't attempt to poke the hardware.
> 
> We need to guard every time cxl_whatever_(read|write) is called. This
> is because a call to those functions will dereference an offset into an
> mmio register, and the mmio mappings get invalidated in the EEH
> teardown.
> 
> Check in the read/write functions in the header.
> We give them the same semantics as usual PCI operations:
>  - a write to a channel that is down is ignored.
>  - a read from a channel that is down returns all fs.
> 
> Also, we try to access the MMIO space of a vPHB device as part of the
> PCI disable path. Because that's a read that bypasses most of our usual
> checks, we handle it explicitly.
> 
> As far as user visible warnings go:
>  - Check link state in file ops, return -EIO if down.
>  - Be reasonably quiet if there's an error in a teardown path,
>    or when we already know the hardware is going down.
>  - Throw a big WARN if someone tries to start a CXL operation
>    while the card is down. This gives a useful stacktrace for
>    debugging whatever is doing that.
> 

My previous comments appear to have been added, making functions from those
macros was a good move. I can't speak too much for the exact function of the
patch but the code looks good.

Reviewed-by: Cyril Bur <cyrilbur at gmail.com>

> Signed-off-by: Daniel Axtens <dja at axtens.net>
> ---
>  drivers/misc/cxl/context.c |  6 +++-
>  drivers/misc/cxl/cxl.h     | 44 ++++++++++++++++++++++------
>  drivers/misc/cxl/file.c    | 19 +++++++++++++
>  drivers/misc/cxl/native.c  | 71 ++++++++++++++++++++++++++++++++++++++++++++--
>  drivers/misc/cxl/vphb.c    | 26 +++++++++++++++++
>  5 files changed, 154 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
> index 1287148629c0..615842115848 100644
> --- a/drivers/misc/cxl/context.c
> +++ b/drivers/misc/cxl/context.c
> @@ -193,7 +193,11 @@ int __detach_context(struct cxl_context *ctx)
>  	if (status != STARTED)
>  		return -EBUSY;
>  
> -	WARN_ON(cxl_detach_process(ctx));
> +	/* Only warn if we detached while the link was OK.
> +	 * If detach fails when hw is down, we don't care.
> +	 */
> +	WARN_ON(cxl_detach_process(ctx) &&
> +		cxl_adapter_link_ok(ctx->afu->adapter));
>  	flush_work(&ctx->fault_work); /* Only needed for dedicated process */
>  	put_pid(ctx->pid);
>  	cxl_ctx_put();
> diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
> index 6a93bfbcd826..9b9e89fd02cc 100644
> --- a/drivers/misc/cxl/cxl.h
> +++ b/drivers/misc/cxl/cxl.h
> @@ -531,6 +531,14 @@ struct cxl_process_element {
>  	__be32 software_state;
>  } __packed;
>  
> +static inline bool cxl_adapter_link_ok(struct cxl *cxl)
> +{
> +	struct pci_dev *pdev;
> +
> +	pdev = to_pci_dev(cxl->dev.parent);
> +	return !pci_channel_offline(pdev);
> +}
> +
>  static inline void __iomem *_cxl_p1_addr(struct cxl *cxl, cxl_p1_reg_t reg)
>  {
>  	WARN_ON(!cpu_has_feature(CPU_FTR_HVMODE));
> @@ -539,12 +547,16 @@ static inline void __iomem *_cxl_p1_addr(struct cxl *cxl, cxl_p1_reg_t reg)
>  
>  static inline void cxl_p1_write(struct cxl *cxl, cxl_p1_reg_t reg, u64 val)
>  {
> -	out_be64(_cxl_p1_addr(cxl, reg), val);
> +	if (likely(cxl_adapter_link_ok(cxl)))
> +		out_be64(_cxl_p1_addr(cxl, reg), val);
>  }
>  
>  static inline u64 cxl_p1_read(struct cxl *cxl, cxl_p1_reg_t reg)
>  {
> -	return in_be64(_cxl_p1_addr(cxl, reg));
> +	if (likely(cxl_adapter_link_ok(cxl)))
> +		return in_be64(_cxl_p1_addr(cxl, reg));
> +	else
> +		return ~0ULL;
>  }
>  
>  static inline void __iomem *_cxl_p1n_addr(struct cxl_afu *afu, cxl_p1n_reg_t reg)
> @@ -555,12 +567,16 @@ static inline void __iomem *_cxl_p1n_addr(struct cxl_afu *afu, cxl_p1n_reg_t reg
>  
>  static inline void cxl_p1n_write(struct cxl_afu *afu, cxl_p1n_reg_t reg, u64 val)
>  {
> -	out_be64(_cxl_p1n_addr(afu, reg), val);
> +	if (likely(cxl_adapter_link_ok(afu->adapter)))
> +		out_be64(_cxl_p1n_addr(afu, reg), val);
>  }
>  
>  static inline u64 cxl_p1n_read(struct cxl_afu *afu, cxl_p1n_reg_t reg)
>  {
> -	return in_be64(_cxl_p1n_addr(afu, reg));
> +	if (likely(cxl_adapter_link_ok(afu->adapter)))
> +		return in_be64(_cxl_p1n_addr(afu, reg));
> +	else
> +		return ~0ULL;
>  }
>  
>  static inline void __iomem *_cxl_p2n_addr(struct cxl_afu *afu, cxl_p2n_reg_t reg)
> @@ -570,22 +586,34 @@ static inline void __iomem *_cxl_p2n_addr(struct cxl_afu *afu, cxl_p2n_reg_t reg
>  
>  static inline void cxl_p2n_write(struct cxl_afu *afu, cxl_p2n_reg_t reg, u64 val)
>  {
> -	out_be64(_cxl_p2n_addr(afu, reg), val);
> +	if (likely(cxl_adapter_link_ok(afu->adapter)))
> +		out_be64(_cxl_p2n_addr(afu, reg), val);
>  }
>  
>  static inline u64 cxl_p2n_read(struct cxl_afu *afu, cxl_p2n_reg_t reg)
>  {
> -	return in_be64(_cxl_p2n_addr(afu, reg));
> +	if (likely(cxl_adapter_link_ok(afu->adapter)))
> +		return in_be64(_cxl_p2n_addr(afu, reg));
> +	else
> +		return ~0ULL;
>  }
>  
>  static inline u64 cxl_afu_cr_read64(struct cxl_afu *afu, int cr, u64 off)
>  {
> -	return in_le64((afu)->afu_desc_mmio + (afu)->crs_offset + ((cr) * (afu)->crs_len) + (off));
> +	if (likely(cxl_adapter_link_ok(afu->adapter)))
> +		return in_le64((afu)->afu_desc_mmio + (afu)->crs_offset +
> +			       ((cr) * (afu)->crs_len) + (off));
> +	else
> +		return ~0ULL;
>  }
>  
>  static inline u32 cxl_afu_cr_read32(struct cxl_afu *afu, int cr, u64 off)
>  {
> -	return in_le32((afu)->afu_desc_mmio + (afu)->crs_offset + ((cr) * (afu)->crs_len) + (off));
> +	if (likely(cxl_adapter_link_ok(afu->adapter)))
> +		return in_le32((afu)->afu_desc_mmio + (afu)->crs_offset +
> +			       ((cr) * (afu)->crs_len) + (off));
> +	else
> +		return 0xffffffff;
>  }
>  u16 cxl_afu_cr_read16(struct cxl_afu *afu, int cr, u64 off);
>  u8 cxl_afu_cr_read8(struct cxl_afu *afu, int cr, u64 off);
> diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
> index c8c8bfa2679b..57bdb473749f 100644
> --- a/drivers/misc/cxl/file.c
> +++ b/drivers/misc/cxl/file.c
> @@ -73,6 +73,11 @@ static int __afu_open(struct inode *inode, struct file *file, bool master)
>  	if (!afu->current_mode)
>  		goto err_put_afu;
>  
> +	if (!cxl_adapter_link_ok(adapter)) {
> +		rc = -EIO;
> +		goto err_put_afu;
> +	}
> +
>  	if (!(ctx = cxl_context_alloc())) {
>  		rc = -ENOMEM;
>  		goto err_put_afu;
> @@ -238,6 +243,9 @@ long afu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  	if (ctx->status == CLOSED)
>  		return -EIO;
>  
> +	if (!cxl_adapter_link_ok(ctx->afu->adapter))
> +		return -EIO;
> +
>  	pr_devel("afu_ioctl\n");
>  	switch (cmd) {
>  	case CXL_IOCTL_START_WORK:
> @@ -265,6 +273,9 @@ int afu_mmap(struct file *file, struct vm_area_struct *vm)
>  	if (ctx->status != STARTED)
>  		return -EIO;
>  
> +	if (!cxl_adapter_link_ok(ctx->afu->adapter))
> +		return -EIO;
> +
>  	return cxl_context_iomap(ctx, vm);
>  }
>  
> @@ -309,6 +320,9 @@ ssize_t afu_read(struct file *file, char __user *buf, size_t count,
>  	int rc;
>  	DEFINE_WAIT(wait);
>  
> +	if (!cxl_adapter_link_ok(ctx->afu->adapter))
> +		return -EIO;
> +
>  	if (count < CXL_READ_MIN_SIZE)
>  		return -EINVAL;
>  
> @@ -319,6 +333,11 @@ ssize_t afu_read(struct file *file, char __user *buf, size_t count,
>  		if (ctx_event_pending(ctx))
>  			break;
>  
> +		if (!cxl_adapter_link_ok(ctx->afu->adapter)) {
> +			rc = -EIO;
> +			goto out;
> +		}
> +
>  		if (file->f_flags & O_NONBLOCK) {
>  			rc = -EAGAIN;
>  			goto out;
> diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
> index 0c94ba27e658..cd1dda5fcd3a 100644
> --- a/drivers/misc/cxl/native.c
> +++ b/drivers/misc/cxl/native.c
> @@ -41,6 +41,12 @@ static int afu_control(struct cxl_afu *afu, u64 command,
>  			rc = -EBUSY;
>  			goto out;
>  		}
> +		if (!cxl_adapter_link_ok(afu->adapter)) {
> +			afu->enabled = enabled;
> +			rc = -EIO;
> +			goto out;
> +		}
> +
>  		pr_devel_ratelimited("AFU control... (0x%.16llx)\n",
>  				     AFU_Cntl | command);
>  		cpu_relax();
> @@ -85,6 +91,10 @@ int __cxl_afu_reset(struct cxl_afu *afu)
>  
>  int cxl_afu_check_and_enable(struct cxl_afu *afu)
>  {
> +	if (!cxl_adapter_link_ok(afu->adapter)) {
> +		WARN(1, "Refusing to enable afu while link down!\n");
> +		return -EIO;
> +	}
>  	if (afu->enabled)
>  		return 0;
>  	return afu_enable(afu);
> @@ -103,6 +113,12 @@ int cxl_psl_purge(struct cxl_afu *afu)
>  
>  	pr_devel("PSL purge request\n");
>  
> +	if (!cxl_adapter_link_ok(afu->adapter)) {
> +		dev_warn(&afu->dev, "PSL Purge called with link down, ignoring\n");
> +		rc = -EIO;
> +		goto out;
> +	}
> +
>  	if ((AFU_Cntl & CXL_AFU_Cntl_An_ES_MASK) != CXL_AFU_Cntl_An_ES_Disabled) {
>  		WARN(1, "psl_purge request while AFU not disabled!\n");
>  		cxl_afu_disable(afu);
> @@ -119,6 +135,11 @@ int cxl_psl_purge(struct cxl_afu *afu)
>  			rc = -EBUSY;
>  			goto out;
>  		}
> +		if (!cxl_adapter_link_ok(afu->adapter)) {
> +			rc = -EIO;
> +			goto out;
> +		}
> +
>  		dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
>  		pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%.16llx  PSL_DSISR: 0x%.16llx\n", PSL_CNTL, dsisr);
>  		if (dsisr & CXL_PSL_DSISR_TRANS) {
> @@ -215,6 +236,8 @@ int cxl_tlb_slb_invalidate(struct cxl *adapter)
>  			dev_warn(&adapter->dev, "WARNING: CXL adapter wide TLBIA timed out!\n");
>  			return -EBUSY;
>  		}
> +		if (!cxl_adapter_link_ok(adapter))
> +			return -EIO;
>  		cpu_relax();
>  	}
>  
> @@ -224,6 +247,8 @@ int cxl_tlb_slb_invalidate(struct cxl *adapter)
>  			dev_warn(&adapter->dev, "WARNING: CXL adapter wide SLBIA timed out!\n");
>  			return -EBUSY;
>  		}
> +		if (!cxl_adapter_link_ok(adapter))
> +			return -EIO;
>  		cpu_relax();
>  	}
>  	return 0;
> @@ -240,6 +265,11 @@ int cxl_afu_slbia(struct cxl_afu *afu)
>  			dev_warn(&afu->dev, "WARNING: CXL AFU SLBIA timed out!\n");
>  			return -EBUSY;
>  		}
> +		/* If the adapter has gone down, we can assume that we
> +		 * will PERST it and that will invalidate everything.
> +		 */
> +		if (!cxl_adapter_link_ok(afu->adapter))
> +			return -EIO;
>  		cpu_relax();
>  	}
>  	return 0;
> @@ -279,6 +309,8 @@ static void slb_invalid(struct cxl_context *ctx)
>  	cxl_p1_write(adapter, CXL_PSL_SLBIA, CXL_TLB_SLB_IQ_LPIDPID);
>  
>  	while (1) {
> +		if (!cxl_adapter_link_ok(adapter))
> +			break;
>  		slbia = cxl_p1_read(adapter, CXL_PSL_SLBIA);
>  		if (!(slbia & CXL_TLB_SLB_P))
>  			break;
> @@ -308,6 +340,11 @@ static int do_process_element_cmd(struct cxl_context *ctx,
>  			rc = -EBUSY;
>  			goto out;
>  		}
> +		if (!cxl_adapter_link_ok(ctx->afu->adapter)) {
> +			dev_warn(&ctx->afu->dev, "WARNING: Device link down, aborting Process Element Command!\n");
> +			rc = -EIO;
> +			goto out;
> +		}
>  		state = be64_to_cpup(ctx->afu->sw_command_status);
>  		if (state == ~0ULL) {
>  			pr_err("cxl: Error adding process element to AFU\n");
> @@ -355,8 +392,13 @@ static int terminate_process_element(struct cxl_context *ctx)
>  
>  	mutex_lock(&ctx->afu->spa_mutex);
>  	pr_devel("%s Terminate pe: %i started\n", __func__, ctx->pe);
> -	rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_TERMINATE,
> -				    CXL_PE_SOFTWARE_STATE_V | CXL_PE_SOFTWARE_STATE_T);
> +	/* We could be asked to terminate when the hw is down. That
> +	 * should always succeed: it's not running if the hw has gone
> +	 * away and is being reset.
> +	 */
> +	if (cxl_adapter_link_ok(ctx->afu->adapter))
> +		rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_TERMINATE,
> +					    CXL_PE_SOFTWARE_STATE_V | CXL_PE_SOFTWARE_STATE_T);
>  	ctx->elem->software_state = 0;	/* Remove Valid bit */
>  	pr_devel("%s Terminate pe: %i finished\n", __func__, ctx->pe);
>  	mutex_unlock(&ctx->afu->spa_mutex);
> @@ -369,7 +411,14 @@ static int remove_process_element(struct cxl_context *ctx)
>  
>  	mutex_lock(&ctx->afu->spa_mutex);
>  	pr_devel("%s Remove pe: %i started\n", __func__, ctx->pe);
> -	if (!(rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_REMOVE, 0)))
> +
> +	/* We could be asked to remove when the hw is down. Again, if
> +	 * the hw is down, the PE is gone, so we succeed.
> +	 */
> +	if (cxl_adapter_link_ok(ctx->afu->adapter))
> +		rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_REMOVE, 0);
> +
> +	if (!rc)
>  		ctx->pe_inserted = false;
>  	slb_invalid(ctx);
>  	pr_devel("%s Remove pe: %i finished\n", __func__, ctx->pe);
> @@ -612,6 +661,11 @@ int cxl_afu_activate_mode(struct cxl_afu *afu, int mode)
>  	if (!(mode & afu->modes_supported))
>  		return -EINVAL;
>  
> +	if (!cxl_adapter_link_ok(afu->adapter)) {
> +		WARN(1, "Device link is down, refusing to activate!\n");
> +		return -EIO;
> +	}
> +
>  	if (mode == CXL_MODE_DIRECTED)
>  		return activate_afu_directed(afu);
>  	if (mode == CXL_MODE_DEDICATED)
> @@ -622,6 +676,11 @@ int cxl_afu_activate_mode(struct cxl_afu *afu, int mode)
>  
>  int cxl_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u64 amr)
>  {
> +	if (!cxl_adapter_link_ok(ctx->afu->adapter)) {
> +		WARN(1, "Device link is down, refusing to attach process!\n");
> +		return -EIO;
> +	}
> +
>  	ctx->kernel = kernel;
>  	if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
>  		return attach_afu_directed(ctx, wed, amr);
> @@ -666,6 +725,12 @@ int cxl_get_irq(struct cxl_afu *afu, struct cxl_irq_info *info)
>  {
>  	u64 pidtid;
>  
> +	/* If the adapter has gone away, we can't get any meaningful
> +	 * information.
> +	 */
> +	if (!cxl_adapter_link_ok(afu->adapter))
> +		return -EIO;
> +
>  	info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
>  	info->dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
>  	info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);
> diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
> index a7b55772a91c..d1c3509c925d 100644
> --- a/drivers/misc/cxl/vphb.c
> +++ b/drivers/misc/cxl/vphb.c
> @@ -138,6 +138,26 @@ static int cxl_pcie_config_info(struct pci_bus *bus, unsigned int devfn,
>  	return 0;
>  }
>  
> +
> +static inline bool cxl_config_link_ok(struct pci_bus *bus)
> +{
> +	struct pci_controller *phb;
> +	struct cxl_afu *afu;
> +
> +	/* Config space IO is based on phb->cfg_addr, which is based on
> +	 * afu_desc_mmio. This isn't safe to read/write when the link
> +	 * goes down, as EEH tears down MMIO space.
> +	 *
> +	 * Check if the link is OK before proceeding.
> +	 */
> +
> +	phb = pci_bus_to_host(bus);
> +	if (phb == NULL)
> +		return false;
> +	afu = (struct cxl_afu *)phb->private_data;
> +	return cxl_adapter_link_ok(afu->adapter);
> +}
> +
>  static int cxl_pcie_read_config(struct pci_bus *bus, unsigned int devfn,
>  				int offset, int len, u32 *val)
>  {
> @@ -150,6 +170,9 @@ static int cxl_pcie_read_config(struct pci_bus *bus, unsigned int devfn,
>  	if (rc)
>  		return rc;
>  
> +	if (!cxl_config_link_ok(bus))
> +		return PCIBIOS_DEVICE_NOT_FOUND;
> +
>  	/* Can only read 32 bits */
>  	*val = (in_le32(ioaddr) >> shift) & mask;
>  	return PCIBIOS_SUCCESSFUL;
> @@ -167,6 +190,9 @@ static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn,
>  	if (rc)
>  		return rc;
>  
> +	if (!cxl_config_link_ok(bus))
> +		return PCIBIOS_DEVICE_NOT_FOUND;
> +
>  	/* Can only write 32 bits so do read-modify-write */
>  	mask <<= shift;
>  	val <<= shift;



More information about the Linuxppc-dev mailing list