[PATCH v3 2/2] pseries/mce: Refactor the pseries mce handling code

Nicholas Piggin npiggin at gmail.com
Thu Nov 25 00:10:27 AEDT 2021


Excerpts from Ganesh Goudar's message of November 24, 2021 7:55 pm:
> Now that we are no longer switching on the mmu in realmode
> mce handler, Revert the commit 4ff753feab02("powerpc/pseries:
> Avoid using addr_to_pfn in real mode") partially, which
> introduced functions mce_handle_err_virtmode/realmode() to
> separate mce handler code which needed translation to enabled.
> 
> Signed-off-by: Ganesh Goudar <ganeshgr at linux.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/ras.c | 122 +++++++++++----------------
>  1 file changed, 49 insertions(+), 73 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
> index 8613f9cc5798..62e1519b8355 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
>  	return 0; /* need to perform reset */
>  }
>  
> -static int mce_handle_err_realmode(int disposition, u8 error_type)
> -{
> -#ifdef CONFIG_PPC_BOOK3S_64
> -	if (disposition == RTAS_DISP_NOT_RECOVERED) {
> -		switch (error_type) {
> -		case	MC_ERROR_TYPE_ERAT:
> -			flush_erat();
> -			disposition = RTAS_DISP_FULLY_RECOVERED;
> -			break;
> -		case	MC_ERROR_TYPE_SLB:
> -			/*
> -			 * Store the old slb content in paca before flushing.
> -			 * Print this when we go to virtual mode.
> -			 * There are chances that we may hit MCE again if there
> -			 * is a parity error on the SLB entry we trying to read
> -			 * for saving. Hence limit the slb saving to single
> -			 * level of recursion.
> -			 */
> -			if (local_paca->in_mce == 1)
> -				slb_save_contents(local_paca->mce_faulty_slbs);
> -			flush_and_reload_slb();
> -			disposition = RTAS_DISP_FULLY_RECOVERED;
> -			break;
> -		default:
> -			break;
> -		}
> -	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
> -		/* Platform corrected itself but could be degraded */
> -		pr_err("MCE: limited recovery, system may be degraded\n");
> -		disposition = RTAS_DISP_FULLY_RECOVERED;
> -	}
> -#endif
> -	return disposition;
> -}
> -
> -static int mce_handle_err_virtmode(struct pt_regs *regs,
> -				   struct rtas_error_log *errp,
> -				   struct pseries_mc_errorlog *mce_log,
> -				   int disposition)
> +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>  {
>  	struct mce_error_info mce_err = { 0 };
> +	unsigned long eaddr = 0, paddr = 0;
> +	struct pseries_errorlog *pseries_log;
> +	struct pseries_mc_errorlog *mce_log;
> +	int disposition = rtas_error_disposition(errp);
>  	int initiator = rtas_error_initiator(errp);
>  	int severity = rtas_error_severity(errp);
> -	unsigned long eaddr = 0, paddr = 0;
>  	u8 error_type, err_sub_type;
>  
> -	if (!mce_log)
> -		goto out;
> -
> -	error_type = mce_log->error_type;
> -	err_sub_type = rtas_mc_error_sub_type(mce_log);
> -
>  	if (initiator == RTAS_INITIATOR_UNKNOWN)
>  		mce_err.initiator = MCE_INITIATOR_UNKNOWN;
>  	else if (initiator == RTAS_INITIATOR_CPU)
> @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>  		mce_err.severity = MCE_SEV_SEVERE;
>  	else if (severity == RTAS_SEVERITY_ERROR)
>  		mce_err.severity = MCE_SEV_SEVERE;
> +	else if (severity == RTAS_SEVERITY_FATAL)
> +		mce_err.severity = MCE_SEV_FATAL;
>  	else
>  		mce_err.severity = MCE_SEV_FATAL;
>  

What's this hunk for?

> @@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>  	mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
>  	mce_err.error_class = MCE_ECLASS_UNKNOWN;
>  
> -	switch (error_type) {
> +	if (!rtas_error_extended(errp))
> +		goto out;
> +
> +	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
> +	if (!pseries_log)
> +		goto out;
> +
> +	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
> +	error_type = mce_log->error_type;
> +	err_sub_type = rtas_mc_error_sub_type(mce_log);
> +
> +	switch (mce_log->error_type) {
>  	case MC_ERROR_TYPE_UE:
>  		mce_err.error_type = MCE_ERROR_TYPE_UE;
>  		mce_common_process_ue(regs, &mce_err);
> @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>  		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
>  		break;
>  	case MC_ERROR_TYPE_I_CACHE:
> -		mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
> +		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
>  		break;

And this one. Doesn't look right.

>  	case MC_ERROR_TYPE_UNKNOWN:
>  	default:
>  		mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
>  		break;
>  	}
> +
> +#ifdef CONFIG_PPC_BOOK3S_64
> +	if (disposition == RTAS_DISP_NOT_RECOVERED) {
> +		switch (error_type) {
> +		case	MC_ERROR_TYPE_SLB:
> +		case	MC_ERROR_TYPE_ERAT:
> +			/*
> +			 * Store the old slb content in paca before flushing.
> +			 * Print this when we go to virtual mode.
> +			 * There are chances that we may hit MCE again if there
> +			 * is a parity error on the SLB entry we trying to read
> +			 * for saving. Hence limit the slb saving to single
> +			 * level of recursion.
> +			 */
> +			if (local_paca->in_mce == 1)
> +				slb_save_contents(local_paca->mce_faulty_slbs);
> +			flush_and_reload_slb();
> +			disposition = RTAS_DISP_FULLY_RECOVERED;
> +			break;
> +		default:
> +			break;
> +		}
> +	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
> +		/* Platform corrected itself but could be degraded */
> +		pr_err("MCE: limited recovery, system may be degraded\n");
> +		disposition = RTAS_DISP_FULLY_RECOVERED;
> +	}

I would prefer if you just keep the mce_handle_err_realmode function 
(can rename it if you want). It's actually changed a bit since the
patch being reverted so we don't want to undo that.

Thanks,
Nick


More information about the Linuxppc-dev mailing list