[PATCH v2] powerpc/eeh: parse AER registers

Narayana Murty N nnmlinux at linux.ibm.com
Tue Aug 26 13:55:47 AEST 2025


On 08/08/25 1:52 PM, Ganesh Goudar wrote:
> parse AER uncorrectable and correctable error status
> registers to print error type and severity.
>
> output looks like
> EEH:AER severity=Uncorrected (Fatal), Error Type: Data Link Protocol Error
>
> Signed-off-by: Ganesh Goudar <ganeshgr at linux.ibm.com>
> ---
> v2:
> * Remove unnecessary checks.
> * Change the error message format.
> ---
>   arch/powerpc/kernel/eeh.c | 81 ++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 80 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 83fe99861eb1..cd083e59d6b3 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -139,6 +139,49 @@ struct eeh_stats {
>   
>   static struct eeh_stats eeh_stats;
>   
> +static const char * const aer_uncor_errors[] = {
> +	"Undefined",
> +	"Undefined",
> +	"Undefined",
> +	"Undefined",
> +	"Data Link Protocol",
> +	"Surprise Down",
> +	"Poisoned TLP",
> +	"Flow Control Protocol",
> +	"Completion Timeout",
> +	"Completer Abort",
> +	"Unexpected Completion",
> +	"Receiver Overflow",
> +	"Malformed TLP",
> +	"ECRC Error",
> +	"Unsupported Request",
> +	"ACS Violation",
> +	"Uncorrectable Internal Error",
> +	"MC Blocked TLP",
> +	"AtomicOp Egress Blocked",
> +	"TLPPrefix Blocked",
> +	"Poisoned TLP Egress Blocked"
> +};
> +
> +/*
> + * Printable names for the AER Correctable Error Status register.
> + * The array index is the bit position tested in eeh_parse_aer_registers()
> + * (cor_status & (1 << i)), so positions with no name assigned are padded
> + * with "Undefined" to keep the following entries aligned.
> + */
> +static const char * const aer_cor_errors[] = {
> +	[0]		= "Receiver Error",
> +	[1 ... 5]	= "Undefined",
> +	[6]		= "Bad TLP",
> +	[7]		= "Bad DLLP",
> +	[8]		= "Replay Num Rollover",
> +	[9 ... 11]	= "Undefined",
> +	[12]		= "Replay Timer Timeout",
> +	[13]		= "Advisory Non-Fatal Error",
> +	[14]		= "Corrected Internal Error",
> +	[15]		= "Header Log Overflow",
> +};
> +
>   static int __init eeh_setup(char *str)
>   {
>   	if (!strcmp(str, "off"))
> @@ -160,6 +203,40 @@ void eeh_show_enabled(void)
>   		pr_info("EEH: No capable adapters found: recovery disabled.\n");
>   }
>   
> +/*
> + * Decode the AER uncorrectable and correctable error status registers at
> + * capability offset @cap of @edev, and log one line per error bit that is set.
> + */
> +static void eeh_parse_aer_registers(struct eeh_dev *edev, int cap)
> +{
> +	unsigned int i;
> +	/* Zeroed in case a config read fails without storing a value. */
> +	u32 uncor_status = 0, uncor_severity = 0, cor_status = 0;
> +
> +	eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_STATUS, 4, &uncor_status);
> +	eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_SEVER, 4, &uncor_severity);
> +	eeh_ops->read_config(edev, cap + PCI_ERR_COR_STATUS, 4, &cor_status);
> +
> +	/* Entry i of each name table describes status bit i. */
> +	for (i = 0; i < ARRAY_SIZE(aer_uncor_errors); i++) {
> +		if (!(uncor_status & (1U << i)))
> +			continue;
> +
> +		/* Severity is looked up at the same bit position as the status bit. */
> +		pr_err("EEH:AER severity=Uncorrected (%s), Error Type: %s\n",
> +		       (uncor_severity & (1U << i)) ? "Fatal" : "Non-Fatal",
> +		       aer_uncor_errors[i]);
> +	}
> +
> +	for (i = 0; i < ARRAY_SIZE(aer_cor_errors); i++) {
> +		if (!(cor_status & (1U << i)))
> +			continue;
> +
> +		pr_err("EEH:AER severity=Correctable, Error Type: %s\n",
> +		       aer_cor_errors[i]);
> +	}
> +}
> +
You missed my review comment on the previous version.
It would be better to also consider the AER mask registers when
interpreting the error status. Otherwise masked-off bits may still appear
in the logs, leading to false positives. For example, something like:

     eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_MASK, 4, &uncor_mask);
     eeh_ops->read_config(edev, cap + PCI_ERR_COR_MASK, 4, &cor_mask);

     if (uncor_status & ~uncor_mask) { ... }
     if (cor_status & ~cor_mask) { ... }

This way only unmasked errors are reported.

Regards,
Narayana Murty N

>   /*
>    * This routine captures assorted PCI configuration space data
>    * for the indicated PCI device, and puts them into a buffer
> @@ -237,9 +314,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
>   		pr_warn("%s\n", buffer);
>   	}
>   
> -	/* If AER capable, dump it */
> +	/* If AER capable, parse and dump it */
>   	cap = edev->aer_cap;
>   	if (cap) {
> +		eeh_parse_aer_registers(edev, cap);
> +
>   		n += scnprintf(buf+n, len-n, "pci-e AER:\n");
>   		pr_warn("EEH: PCI-E AER capability register set follows:\n");
>   


More information about the Linuxppc-dev mailing list