[PATCH v4 3/3] PCI/AER: Report fatal errors of RCiEP and EP if link recoverd
Shuai Xue
xueshuai at linux.alibaba.com
Mon Mar 3 15:33:59 AEDT 2025
在 2025/3/3 11:43, Sathyanarayanan Kuppuswamy 写道:
>
> On 2/16/25 6:42 PM, Shuai Xue wrote:
>> The AER driver has historically avoided reading the configuration space of
>> an endpoint or RCiEP that reported a fatal error, considering the link to
>> that device unreliable. Consequently, when a fatal error occurs, the AER
>> and DPC drivers do not report specific error types, resulting in logs like:
>>
>> pcieport 0000:30:03.0: EDR: EDR event received
>> pcieport 0000:30:03.0: DPC: containment event, status:0x0005 source:0x3400
>> pcieport 0000:30:03.0: DPC: ERR_FATAL detected
>> pcieport 0000:30:03.0: AER: broadcast error_detected message
>> nvme nvme0: frozen state error detected, reset controller
>> nvme 0000:34:00.0: ready 0ms after DPC
>> pcieport 0000:30:03.0: AER: broadcast slot_reset message
>>
>> AER status registers are sticky and Write-1-to-clear. If the link recovered
>> after hot reset, we can still safely access AER status of the error device.
>> In such case, report fatal errors which helps to figure out the error root
>> case.
>>
>> After this patch, the logs like:
>>
>> pcieport 0000:30:03.0: EDR: EDR event received
>> pcieport 0000:30:03.0: DPC: containment event, status:0x0005 source:0x3400
>> pcieport 0000:30:03.0: DPC: ERR_FATAL detected
>> pcieport 0000:30:03.0: AER: broadcast error_detected message
>> nvme nvme0: frozen state error detected, reset controller
>> pcieport 0000:30:03.0: waiting 100 ms for downstream link, after activation
>> nvme 0000:34:00.0: ready 0ms after DPC
>> nvme 0000:34:00.0: PCIe Bus Error: severity=Uncorrectable (Fatal), type=Data Link Layer, (Receiver ID)
>> nvme 0000:34:00.0: device [144d:a804] error status/mask=00000010/00504000
>> nvme 0000:34:00.0: [ 4] DLP (First)
>> pcieport 0000:30:03.0: AER: broadcast slot_reset message
>
> IMO, above info about device error details is more of a debug info. Since the
> main use of this info use to understand more details about the recovered
> DPC error. So I think is better to print with debug tag. Lets see what others
> think.
>
> Code wise, looks fine to me.
thanks, looking forward to more feedback.
>
>
>
>> Signed-off-by: Shuai Xue <xueshuai at linux.alibaba.com>
>> ---
>> drivers/pci/pci.h | 3 ++-
>> drivers/pci/pcie/aer.c | 11 +++++++----
>> drivers/pci/pcie/dpc.c | 2 +-
>> drivers/pci/pcie/err.c | 9 +++++++++
>> 4 files changed, 19 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>> index 870d2fbd6ff2..e852fa58b250 100644
>> --- a/drivers/pci/pci.h
>> +++ b/drivers/pci/pci.h
>> @@ -549,7 +549,8 @@ struct aer_err_info {
>> struct pcie_tlp_log tlp; /* TLP Header */
>> };
>> -int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
>> +int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info,
>> + bool link_healthy);
>> void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
>> int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index 508474e17183..bfb67db074f0 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -1197,12 +1197,14 @@ EXPORT_SYMBOL_GPL(aer_recover_queue);
>> * aer_get_device_error_info - read error status from dev and store it to info
>> * @dev: pointer to the device expected to have a error record
>> * @info: pointer to structure to store the error record
>> + * @link_healthy: link is healthy or not
>> *
>> * Return 1 on success, 0 on error.
>> *
>> * Note that @info is reused among all error devices. Clear fields properly.
>> */
>> -int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
>> +int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info,
>> + bool link_healthy)
>> {
>> int type = pci_pcie_type(dev);
>> int aer = dev->aer_cap;
>> @@ -1226,7 +1228,8 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
>> } else if (type == PCI_EXP_TYPE_ROOT_PORT ||
>> type == PCI_EXP_TYPE_RC_EC ||
>> type == PCI_EXP_TYPE_DOWNSTREAM ||
>> - info->severity == AER_NONFATAL) {
>> + info->severity == AER_NONFATAL ||
>> + (info->severity == AER_FATAL && link_healthy)) {
>> /* Link is still healthy for IO reads */
>> pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS,
>> @@ -1258,11 +1261,11 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info)
>> /* Report all before handle them, not to lost records by reset etc. */
>> for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
>> - if (aer_get_device_error_info(e_info->dev[i], e_info))
>> + if (aer_get_device_error_info(e_info->dev[i], e_info, false))
>> aer_print_error(e_info->dev[i], e_info);
>> }
>> for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
>> - if (aer_get_device_error_info(e_info->dev[i], e_info))
>> + if (aer_get_device_error_info(e_info->dev[i], e_info, false))
>> handle_error_source(e_info->dev[i], e_info);
>> }
>> }
>> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
>> index ea3ea989afa7..2d3dd831b755 100644
>> --- a/drivers/pci/pcie/dpc.c
>> +++ b/drivers/pci/pcie/dpc.c
>> @@ -303,7 +303,7 @@ struct pci_dev *dpc_process_error(struct pci_dev *pdev)
>> dpc_process_rp_pio_error(pdev);
>> else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
>> dpc_get_aer_uncorrect_severity(pdev, &info) &&
>> - aer_get_device_error_info(pdev, &info)) {
>> + aer_get_device_error_info(pdev, &info, false)) {
>> aer_print_error(pdev, &info);
>> pci_aer_clear_nonfatal_status(pdev);
>> pci_aer_clear_fatal_status(pdev);
>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
>> index 31090770fffc..462577b8d75a 100644
>> --- a/drivers/pci/pcie/err.c
>> +++ b/drivers/pci/pcie/err.c
>> @@ -196,6 +196,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>> struct pci_dev *bridge;
>> pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
>> struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
>> + struct aer_err_info info;
>> /*
>> * If the error was detected by a Root Port, Downstream Port, RCEC,
>> @@ -223,6 +224,13 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>> pci_warn(bridge, "subordinate device reset failed\n");
>> goto failed;
>> }
>> +
>> + info.severity = AER_FATAL;
>> + /* Link recovered, report fatal errors of RCiEP or EP */
>> + if ((type == PCI_EXP_TYPE_ENDPOINT ||
>> + type == PCI_EXP_TYPE_RC_END) &&
>> + aer_get_device_error_info(dev, &info, true))
>> + aer_print_error(dev, &info);
>> } else {
>> pci_walk_bridge(bridge, report_normal_detected, &status);
>> }
>> @@ -259,6 +267,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>> if (host->native_aer || pcie_ports_native) {
>> pcie_clear_device_status(dev);
>> pci_aer_clear_nonfatal_status(dev);
>> + pci_aer_clear_fatal_status(dev);
>
> Add some info about above change in the commit log.
Will do.
Thanks.
Shuai
More information about the Linuxppc-dev
mailing list