[PATCH/RFC] ppc64: EEH + SCSI recovery (IPR only)

Brian King brking at us.ibm.com
Thu Feb 24 06:34:56 EST 2005


Linas Vepstas wrote:
> ===== drivers/scsi/ipr.c 1.31 vs edited =====
> --- 1.31/drivers/scsi/ipr.c	2004-12-14 17:06:35 -06:00
> +++ edited/drivers/scsi/ipr.c	2005-02-22 17:37:41 -06:00
> @@ -80,6 +80,8 @@
>  #include <scsi/scsi_eh.h>
>  #include <scsi/scsi_cmnd.h>
>  #include <scsi/scsi_request.h>
> +
> +#define CONFIG_SCSI_IPR_EEH

This will obviously need to get cleaned up. This config option should
go away eventually.. I am assuming it is your way of flagging the
parts of code you have changed...

>  #include "ipr.h"
>  
>  /*
> @@ -2917,7 +2919,6 @@ static int ipr_eh_host_reset(struct scsi
>  
>  	if (WAIT_FOR_DUMP == ioa_cfg->sdt_state)
>  		ioa_cfg->sdt_state = GET_DUMP;
> -
>  	rc = ipr_reset_reload(ioa_cfg, IPR_SHUTDOWN_ABBREV);

Don't delete blank lines...

>  	LEAVE;
> @@ -5007,6 +5008,67 @@ static int ipr_reset_start_bist(struct i
>  	return rc;
>  }
>  
> +#ifdef CONFIG_SCSI_IPR_EEH
> +
> +static int ipr_reset_shutdown_ioa(struct ipr_cmnd *ipr_cmd);
> +
> +#define IPR_WAIT_FOR_EEH_RESET (HZ)

This should go in ipr.h with all the other timeout literals

> +static int ipr_reset_poll_eeh_recovery(struct ipr_cmnd *ipr_cmd)
> +{
> +	struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg;
> +	int rc;
> +
> +	ENTER;
> +	if (ioa_cfg->wait_on_eeh_reset) {
> +		ipr_reset_start_timer(ipr_cmd, IPR_WAIT_FOR_EEH_RESET);
> +		rc = IPR_RC_JOB_RETURN;
> +	} else {
> +		ipr_cmd->job_step = ipr_reset_start_bist;
> +		rc = IPR_RC_JOB_CONTINUE;
> +	}
> +
> +	LEAVE;
> +	return rc;
> +}

I really don't like this polling.

> +
> +static void ipr_eeh_frozen (struct pci_dev *pdev, void * data)
> +{
> +	struct ipr_ioa_cfg *ioa_cfg = data;
> +	ioa_cfg->wait_on_eeh_reset = 1;
> +}

Probably don't need the second arg - void * data. You can get the
ioa_cfg pointer with pci_get_drvdata(pdev)

Also, this function should start the ipr reset job. You need to prevent
ipr from talking to the device. Maybe something like:


static int ipr_reset_frozen(struct ipr_cmnd *ipr_cmd)
{
	struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg;

	list_add_tail(&ipr_cmd->queue, &ipr_cmd->ioa_cfg->pending_q);
	ipr_cmd->done = ipr_reset_ioa_job;
	return IPR_RC_JOB_RETURN;
}

static void ipr_eeh_frozen(struct pci_dev *pdev)
{
	unsigned long host_lock_flags = 0;
	struct ipr_ioa_cfg *ioa_cfg = pci_get_drvdata(pdev);

	spin_lock_irqsave(ioa_cfg->host->host_lock, flags);
	_ipr_initiate_ioa_reset(ioa_cfg, ipr_reset_frozen, IPR_SHUTDOWN_NONE);
	spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags);
}

static void ipr_eeh_post_reset(struct pci_dev *pdev)
{
	unsigned long host_lock_flags = 0;
	struct ipr_ioa_cfg *ioa_cfg = pci_get_drvdata(pdev);

	spin_lock_irqsave(ioa_cfg->host->host_lock, flags);
	_ipr_initiate_ioa_reset(ioa_cfg, ipr_reset_restore_cfg_space, IPR_SHUTDOWN_NONE);
	spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags);
}

That would let you get rid of all the polling code and should simplify
the code a bit.

> +static void ipr_eeh_post_reset (struct pci_dev *pdev, void * data)
> +{
> +	struct ipr_ioa_cfg *ioa_cfg = data;
> +	ioa_cfg->wait_on_eeh_reset = 0;
> +}

Same comment as above.

> +
> +static void ipr_eeh_perm_failure (struct pci_dev *pdev, void * data)
> +{
> +	// struct ipr_ioa_cfg *ioa_cfg = data;
> +	
> +#if 0  // XXXXXXXXXXXXXXXXXXXXXXX
> +	ipr_cmd->job_step = ipr_reset_shutdown_ioa;
> +	rc = IPR_RC_JOB_CONTINUE;
> +#endif
> +}

This needs to "bringdown" the adapter, but not actually touch it. Basically
stuff like unblocking requests so that we fail them instead of hanging, etc.

> +
> +static void ipr_register_eeh_handlers (struct ipr_ioa_cfg *ioa_cfg)
> +{
> +	/* XXX borken memory management; this malloc not managed */
> +	struct eeh_recovery_ops *eeh_ops;
> +	eeh_ops = kmalloc (sizeof(struct eeh_recovery_ops), GFP_KERNEL);
> +	memset (eeh_ops, 0, sizeof(struct eeh_recovery_ops));
> +	eeh_ops->frozen = ipr_eeh_frozen;
> +	eeh_ops->post_reset = ipr_eeh_post_reset;
> +	eeh_ops->perm_failure = ipr_eeh_perm_failure;
> +	eeh_ops->data = ioa_cfg;
> +	eeh_register_recovery_ops (ioa_cfg->pdev, eeh_ops);
> +}

If this was generalized a bit more, you could add these function
pointers into the pci_driver struct so they could be statically
initialized in each driver.

>  /**
>   * ipr_reset_allowed - Query whether or not IOA can be reset
>   * @ioa_cfg:	ioa config struct
> @@ -5042,6 +5104,7 @@ static int ipr_reset_wait_to_start_bist(
>  	struct ipr_ioa_cfg *ioa_cfg = ipr_cmd->ioa_cfg;
>  	int rc = IPR_RC_JOB_RETURN;
>  
> +#ifndef CONFIG_SCSI_IPR_EEH
>  	if (!ipr_reset_allowed(ioa_cfg) && ipr_cmd->u.time_left) {
>  		ipr_cmd->u.time_left -= IPR_CHECK_FOR_RESET_TIMEOUT;
>  		ipr_reset_start_timer(ipr_cmd, IPR_CHECK_FOR_RESET_TIMEOUT);
> @@ -5049,6 +5112,21 @@ static int ipr_reset_wait_to_start_bist(
>  		ipr_cmd->job_step = ipr_reset_start_bist;
>  		rc = IPR_RC_JOB_CONTINUE;
>  	}
> +#else
> +	if (!ipr_reset_allowed(ioa_cfg) && ipr_cmd->u.time_left 
> +	    && !eeh_slot_is_isolated (ioa_cfg->pdev)) {
> +			  
> +		ipr_cmd->u.time_left -= IPR_CHECK_FOR_RESET_TIMEOUT;
> +		ipr_reset_start_timer(ipr_cmd, IPR_CHECK_FOR_RESET_TIMEOUT);
> +	} else {
> +		if (eeh_slot_is_isolated (ioa_cfg->pdev)) {
> +			ipr_cmd->job_step = ipr_reset_poll_eeh_recovery;
> +		} else {
> +			ipr_cmd->job_step = ipr_reset_start_bist;
> +		}
> +		rc = IPR_RC_JOB_CONTINUE;
> +	}
> +#endif

Not sure what you are trying to accomplish with this bit of code. Does
not seem necessary to me.


> @@ -5079,7 +5157,16 @@ static int ipr_reset_alert(struct ipr_cm
>  		writel(IPR_UPROCI_RESET_ALERT, ioa_cfg->regs.set_uproc_interrupt_reg);
>  		ipr_cmd->job_step = ipr_reset_wait_to_start_bist;
>  	} else {
> +#ifndef CONFIG_SCSI_IPR_EEH
>  		ipr_cmd->job_step = ipr_reset_start_bist;
> +#else
> +		if (eeh_slot_is_isolated (ioa_cfg->pdev)) {
> +			ipr_cmd->job_step = ipr_reset_poll_eeh_recovery;
> +			return IPR_RC_JOB_CONTINUE;
> +		} else {
> +			ipr_cmd->job_step = ipr_reset_start_bist;
> +		}
> +#endif
>  	}

This is not needed.


>  
>  	if (rc != PCIBIOS_SUCCESSFUL) {
>  		dev_err(&pdev->dev, "Failed to save PCI config space\n");
> ===== drivers/scsi/ipr.h 1.21 vs edited =====
> --- 1.21/drivers/scsi/ipr.h	2004-12-14 17:09:02 -06:00
> +++ edited/drivers/scsi/ipr.h	2005-02-22 15:52:36 -06:00
> @@ -833,6 +833,9 @@ struct ipr_ioa_cfg {
>  	u8 dump_taken:1;
>  	u8 allow_cmds:1;
>  	u8 allow_ml_add_del:1;
> +#ifdef CONFIG_SCSI_IPR_EEH
> +	u8 wait_on_eeh_reset:1;
> +#endif

If you make the changes I suggest above, you should be able to remove this flag
altogether.



-- 
Brian King
eServer Storage I/O
IBM Linux Technology Center




More information about the Linuxppc64-dev mailing list