[PATCH] powerpc/eeh: parse AER registers
Narayana Murty N
nnmlinux at linux.ibm.com
Tue Jul 22 19:01:39 AEST 2025
On 2025/07/03 09:15 AM, Ganesh Goudar wrote:
> parse AER uncorrectable and correctable error status
> registers to print error type and severity.
>
> output looks like
> EEH:AER Uncorrectable Error
> EEH:AER Error Type: Data Link Protocol Error [Fatal]
>
> Signed-off-by: Ganesh Goudar <ganeshgr at linux.ibm.com>
> ---
> arch/powerpc/kernel/eeh.c | 84 ++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 83 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 83fe99861eb1..03e1e2eeb679 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -139,6 +139,49 @@ struct eeh_stats {
>
> static struct eeh_stats eeh_stats;
>
> +static const char * const aer_uncor_errors[] = {
> + "Undefined",
> + "Undefined",
> + "Undefined",
> + "Undefined",
> + "Data Link Protocol",
> + "Surprise Down",
> + "Poisoned TLP",
> + "Flow Control Protocol",
> + "Completion Timeout",
> + "Completer Abort",
> + "Unexpected Completion",
> + "Receiver Overflow",
> + "Malformed TLP",
> + "ECRC Error",
> + "Unsupported Request",
> + "ACS Violation",
> + "Uncorrectable Internal Error",
> + "MC Blocked TLP",
> + "AtomicOp Egress Blocked",
> + "TLPPrefix Blocked",
> + "Poisoned TLP Egress Blocked"
> +};
> +
> +static const char * const aer_cor_errors[] = {
> + "Receiver Error",
> + "Undefined",
> + "Undefined",
> + "Undefined",
> + "Undefined",
> + "Undefined",
> + "Bad TLP",
> + "Bad DLLP",
> + "Replay Num Rollover",
> + "Undefined",
> + "Undefined",
> + "Undefined",
> + "Replay Timer Timeout",
> + "Advisory Non-Fatal Error",
> + "Corrected Internal Error",
> + "Header Log Overflow",
> +};
> +
> static int __init eeh_setup(char *str)
> {
> if (!strcmp(str, "off"))
> @@ -160,6 +203,43 @@ void eeh_show_enabled(void)
> pr_info("EEH: No capable adapters found: recovery disabled.\n");
> }
>
> +static void eeh_parse_aer_registers(struct eeh_dev *edev, int cap)
> +{
> + int i;
> + const char *error_type;
> + u32 uncor_status, uncor_severity, cor_status;
> +
> + eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_STATUS, 4, &uncor_status);
> + eeh_ops->read_config(edev, cap + PCI_ERR_UNCOR_SEVER, 4, &uncor_severity);
> + eeh_ops->read_config(edev, cap + PCI_ERR_COR_STATUS, 4, &cor_status);
> +
> + if (!uncor_status && !cor_status)
> + return;
> +
> + if (uncor_status) {
> + pr_err("EEH:AER Uncorrectable Error\n");
> + for (i = 0; i < ARRAY_SIZE(aer_uncor_errors); i++) {
> + if (uncor_status & (1 << i)) {
> + error_type = (i < ARRAY_SIZE(aer_uncor_errors))
> + ? aer_uncor_errors[i] : "Unknown";
> + pr_err("EEH:AER Error Type: %s [%s]\n", error_type,
> + (uncor_severity & (1 << i)) ? "Fatal" : "Non-Fatal");
> + }
> + }
> + }
> +
> + if (cor_status) {
> + pr_err("EEH:AER Correctable Error\n");
> + for (i = 0; i < ARRAY_SIZE(aer_cor_errors); i++) {
> + if (cor_status & (1 << i)) {
> + error_type = (i < ARRAY_SIZE(aer_cor_errors))
> + ? aer_cor_errors[i] : "Unknown";
> + pr_err("EEH:AER Error Type: %s\n", error_type);
> + }
> + }
> + }
> +}
> +
I think the AER mask registers are essential to understanding which errors are enabled,
filtered, or significant in the given context. Ignoring them could result in misinterpreting
disabled error bits, producing false positives or unintended error recovery behavior.
I would suggest reading and applying the relevant mask register (e.g. PCI_ERR_UNCOR_MASK,
PCI_ERR_COR_MASK) when interpreting the status values, and ensuring that masked-off bits
are either ignored or logged appropriately.
Regards,
Narayana Murty N.
> /*
> * This routine captures assorted PCI configuration space data
> * for the indicated PCI device, and puts them into a buffer
> @@ -237,9 +317,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
> pr_warn("%s\n", buffer);
> }
>
> - /* If AER capable, dump it */
> + /* If AER capable, parse and dump it */
> cap = edev->aer_cap;
> if (cap) {
> + eeh_parse_aer_registers(edev, cap);
> +
> n += scnprintf(buf+n, len-n, "pci-e AER:\n");
> pr_warn("EEH: PCI-E AER capability register set follows:\n");
>
> --
> 2.48.1
>
>
>
More information about the Linuxppc-dev
mailing list