Skip to content

Commit 406fec2

Browse files
author
Fox Snowpatch
committed
1 parent 960c1fd commit 406fec2

7 files changed

Lines changed: 82 additions & 32 deletions

File tree

drivers/pci/pci.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2246,7 +2246,7 @@ void pcie_clear_device_status(struct pci_dev *dev)
22462246
u16 sta;
22472247

22482248
pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
2249-
pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
2249+
pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta & PCI_EXP_DEVSTA_ERR);
22502250
}
22512251
#endif
22522252

drivers/pci/pci.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -746,8 +746,10 @@ struct aer_err_info {
746746
struct pcie_tlp_log tlp; /* TLP Header */
747747
};
748748

749-
int aer_get_device_error_info(struct aer_err_info *info, int i);
749+
int aer_get_device_error_info(struct aer_err_info *info, int i,
750+
bool link_healthy);
750751
void aer_print_error(struct aer_err_info *info, int i);
752+
void aer_report_frozen_error(struct pci_dev *dev);
751753

752754
int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
753755
unsigned int tlp_len, bool flit,
@@ -771,7 +773,7 @@ struct rcec_ea {
771773
void pci_save_dpc_state(struct pci_dev *dev);
772774
void pci_restore_dpc_state(struct pci_dev *dev);
773775
void pci_dpc_init(struct pci_dev *pdev);
774-
void dpc_process_error(struct pci_dev *pdev);
776+
struct pci_dev *dpc_process_error(struct pci_dev *pdev);
775777
pci_ers_result_t dpc_reset_link(struct pci_dev *pdev);
776778
bool pci_dpc_recovered(struct pci_dev *pdev);
777779
unsigned int dpc_tlp_log_len(struct pci_dev *dev);

drivers/pci/pcie/aer.c

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1384,12 +1384,14 @@ EXPORT_SYMBOL_GPL(aer_recover_queue);
13841384
* aer_get_device_error_info - read error status from dev and store it to info
13851385
* @info: pointer to structure to store the error record
13861386
* @i: index into info->dev[]
1387+
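* @link_healthy: true if the link to the device is known to be usable, so the device's AER status registers are read even when the severity is AER_FATAL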
13871388
*
13881389
* Return: 1 on success, 0 on error.
13891390
*
13901391
* Note that @info is reused among all error devices. Clear fields properly.
13911392
*/
1392-
int aer_get_device_error_info(struct aer_err_info *info, int i)
1393+
int aer_get_device_error_info(struct aer_err_info *info, int i,
1394+
bool link_healthy)
13931395
{
13941396
struct pci_dev *dev;
13951397
int type, aer;
@@ -1420,7 +1422,8 @@ int aer_get_device_error_info(struct aer_err_info *info, int i)
14201422
} else if (type == PCI_EXP_TYPE_ROOT_PORT ||
14211423
type == PCI_EXP_TYPE_RC_EC ||
14221424
type == PCI_EXP_TYPE_DOWNSTREAM ||
1423-
info->severity == AER_NONFATAL) {
1425+
info->severity == AER_NONFATAL ||
1426+
(info->severity == AER_FATAL && link_healthy)) {
14241427

14251428
/* Link is still healthy for IO reads */
14261429
pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS,
@@ -1447,17 +1450,38 @@ int aer_get_device_error_info(struct aer_err_info *info, int i)
14471450
return 1;
14481451
}
14491452

1453+
void aer_report_frozen_error(struct pci_dev *dev)
1454+
{
1455+
struct aer_err_info info;
1456+
int type = pci_pcie_type(dev);
1457+
1458+
if (type != PCI_EXP_TYPE_ENDPOINT && type != PCI_EXP_TYPE_RC_END)
1459+
return;
1460+
1461+
info.error_dev_num = 0;
1462+
info.severity = AER_FATAL;
1463+
info.level = KERN_ERR;
1464+
add_error_device(&info, dev);
1465+
1466+
if (aer_get_device_error_info(&info, 0, true)) {
1467+
pci_err(dev, "Errors reported prior to reset\n");
1468+
aer_print_error(&info, 0);
1469+
}
1470+
1471+
pci_dev_put(dev); /* pairs with pci_dev_get() in add_error_device() */
1472+
}
1473+
14501474
static inline void aer_process_err_devices(struct aer_err_info *e_info)
14511475
{
14521476
int i;
14531477

14541478
/* Report all before handling them, to not lose records by reset etc. */
14551479
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
1456-
if (aer_get_device_error_info(e_info, i))
1480+
if (aer_get_device_error_info(e_info, i, false))
14571481
aer_print_error(e_info, i);
14581482
}
14591483
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
1460-
if (aer_get_device_error_info(e_info, i))
1484+
if (aer_get_device_error_info(e_info, i, false))
14611485
handle_error_source(e_info->dev[i], e_info);
14621486
}
14631487
}

drivers/pci/pcie/dpc.c

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -260,10 +260,20 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev *dev,
260260
return 1;
261261
}
262262

263-
void dpc_process_error(struct pci_dev *pdev)
263+
/**
264+
* dpc_process_error - handle the DPC error status
265+
* @pdev: the port that experienced the containment event
266+
*
267+
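* Return: the device that detected the error: @pdev itself, or the device looked up from the DPC error source ID (NULL if that lookup finds no device).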
268+
*
269+
* NOTE: The device reference count is increased, the caller must decrement
270+
* the reference count by calling pci_dev_put().
271+
*/
272+
struct pci_dev *dpc_process_error(struct pci_dev *pdev)
264273
{
265274
u16 cap = pdev->dpc_cap, status, source, reason, ext_reason;
266275
struct aer_err_info info = {};
276+
struct pci_dev *err_dev;
267277

268278
pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
269279

@@ -274,11 +284,12 @@ void dpc_process_error(struct pci_dev *pdev)
274284
pci_warn(pdev, "containment event, status:%#06x: unmasked uncorrectable error detected\n",
275285
status);
276286
if (dpc_get_aer_uncorrect_severity(pdev, &info) &&
277-
aer_get_device_error_info(&info, 0)) {
287+
aer_get_device_error_info(&info, 0, false)) {
278288
aer_print_error(&info, 0);
279289
pci_aer_clear_nonfatal_status(pdev);
280290
pci_aer_clear_fatal_status(pdev);
281291
}
292+
err_dev = pci_dev_get(pdev);
282293
break;
283294
case PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE:
284295
case PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE:
@@ -290,6 +301,8 @@ void dpc_process_error(struct pci_dev *pdev)
290301
"ERR_FATAL" : "ERR_NONFATAL",
291302
pci_domain_nr(pdev->bus), PCI_BUS_NUM(source),
292303
PCI_SLOT(source), PCI_FUNC(source));
304+
err_dev = pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
305+
PCI_BUS_NUM(source), source & 0xff);
293306
break;
294307
case PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT:
295308
ext_reason = status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT;
@@ -304,8 +317,11 @@ void dpc_process_error(struct pci_dev *pdev)
304317
if (ext_reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO &&
305318
pdev->dpc_rp_extensions)
306319
dpc_process_rp_pio_error(pdev);
320+
err_dev = pci_dev_get(pdev);
307321
break;
308322
}
323+
324+
return err_dev;
309325
}
310326

311327
static void pci_clear_surpdn_errors(struct pci_dev *pdev)
@@ -361,21 +377,22 @@ static bool dpc_is_surprise_removal(struct pci_dev *pdev)
361377

362378
static irqreturn_t dpc_handler(int irq, void *context)
363379
{
364-
struct pci_dev *pdev = context;
380+
struct pci_dev *err_port = context, *err_dev;
365381

366382
/*
367383
* According to PCIe r6.0 sec 6.7.6, errors are an expected side effect
368384
* of async removal and should be ignored by software.
369385
*/
370-
if (dpc_is_surprise_removal(pdev)) {
371-
dpc_handle_surprise_removal(pdev);
386+
if (dpc_is_surprise_removal(err_port)) {
387+
dpc_handle_surprise_removal(err_port);
372388
return IRQ_HANDLED;
373389
}
374390

375-
dpc_process_error(pdev);
391+
err_dev = dpc_process_error(err_port);
376392

377393
/* We configure DPC so it only triggers on ERR_FATAL */
378-
pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link);
394+
pcie_do_recovery(err_dev, pci_channel_io_frozen, dpc_reset_link);
395+
pci_dev_put(err_dev);
379396

380397
return IRQ_HANDLED;
381398
}

drivers/pci/pcie/edr.c

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ static int acpi_send_edr_status(struct pci_dev *pdev, struct pci_dev *edev,
150150

151151
static void edr_handle_event(acpi_handle handle, u32 event, void *data)
152152
{
153-
struct pci_dev *pdev = data, *edev;
153+
struct pci_dev *pdev = data, *err_port, *err_dev;
154154
pci_ers_result_t estate = PCI_ERS_RESULT_DISCONNECT;
155155
u16 status;
156156

@@ -169,36 +169,37 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data)
169169
* may be that port or a parent of it (PCI Firmware r3.3, sec
170170
* 4.6.13).
171171
*/
172-
edev = acpi_dpc_port_get(pdev);
173-
if (!edev) {
172+
err_port = acpi_dpc_port_get(pdev);
173+
if (!err_port) {
174174
pci_err(pdev, "Firmware failed to locate DPC port\n");
175175
return;
176176
}
177177

178-
pci_dbg(pdev, "Reported EDR dev: %s\n", pci_name(edev));
178+
pci_dbg(pdev, "Reported EDR dev: %s\n", pci_name(err_port));
179179

180180
/* If port does not support DPC, just send the OST */
181-
if (!edev->dpc_cap) {
182-
pci_err(edev, FW_BUG "This device doesn't support DPC\n");
181+
if (!err_port->dpc_cap) {
182+
pci_err(err_port, FW_BUG "This device doesn't support DPC\n");
183183
goto send_ost;
184184
}
185185

186186
/* Check if there is a valid DPC trigger */
187-
pci_read_config_word(edev, edev->dpc_cap + PCI_EXP_DPC_STATUS, &status);
187+
pci_read_config_word(err_port, err_port->dpc_cap + PCI_EXP_DPC_STATUS, &status);
188188
if (!(status & PCI_EXP_DPC_STATUS_TRIGGER)) {
189-
pci_err(edev, "Invalid DPC trigger %#010x\n", status);
189+
pci_err(err_port, "Invalid DPC trigger %#010x\n", status);
190190
goto send_ost;
191191
}
192192

193-
dpc_process_error(edev);
194-
pci_aer_raw_clear_status(edev);
193+
err_dev = dpc_process_error(err_port);
194+
pci_aer_raw_clear_status(err_port);
195195

196196
/*
197197
* Irrespective of whether the DPC event is triggered by ERR_FATAL
198198
* or ERR_NONFATAL, since the link is already down, use the FATAL
199199
* error recovery path for both cases.
200200
*/
201-
estate = pcie_do_recovery(edev, pci_channel_io_frozen, dpc_reset_link);
201+
estate = pcie_do_recovery(err_dev, pci_channel_io_frozen, dpc_reset_link);
202+
pci_dev_put(err_dev);
202203

203204
send_ost:
204205

@@ -207,15 +208,15 @@ static void edr_handle_event(acpi_handle handle, u32 event, void *data)
207208
* to firmware. If not successful, send _OST(0xF, BDF << 16 | 0x81).
208209
*/
209210
if (estate == PCI_ERS_RESULT_RECOVERED) {
210-
pci_dbg(edev, "DPC port successfully recovered\n");
211-
pcie_clear_device_status(edev);
212-
acpi_send_edr_status(pdev, edev, EDR_OST_SUCCESS);
211+
pci_dbg(err_port, "DPC port successfully recovered\n");
212+
pcie_clear_device_status(err_port);
213+
acpi_send_edr_status(pdev, err_port, EDR_OST_SUCCESS);
213214
} else {
214-
pci_dbg(edev, "DPC port recovery failed\n");
215-
acpi_send_edr_status(pdev, edev, EDR_OST_FAILED);
215+
pci_dbg(err_port, "DPC port recovery failed\n");
216+
acpi_send_edr_status(pdev, err_port, EDR_OST_FAILED);
216217
}
217218

218-
pci_dev_put(edev);
219+
pci_dev_put(err_port);
219220
}
220221

221222
void pci_acpi_add_edr_notifier(struct pci_dev *pdev)

drivers/pci/pcie/err.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,11 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
253253
pci_warn(bridge, "subordinate device reset failed\n");
254254
goto failed;
255255
}
256+
257+
/* Link recovered, report fatal errors of RCiEP or EP */
258+
if (state == pci_channel_io_frozen)
259+
aer_report_frozen_error(dev);
260+
256261
}
257262

258263
if (status == PCI_ERS_RESULT_NEED_RESET) {
@@ -280,7 +285,7 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
280285
*/
281286
if (host->native_aer || pcie_ports_native) {
282287
pcie_clear_device_status(dev);
283-
pci_aer_clear_nonfatal_status(dev);
288+
pci_aer_raw_clear_status(dev);
284289
}
285290

286291
pci_walk_bridge(bridge, pci_pm_runtime_put, NULL);

include/uapi/linux/pci_regs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,7 @@
534534
#define PCI_EXP_DEVSTA_NFED 0x0002 /* Non-Fatal Error Detected */
535535
#define PCI_EXP_DEVSTA_FED 0x0004 /* Fatal Error Detected */
536536
#define PCI_EXP_DEVSTA_URD 0x0008 /* Unsupported Request Detected */
537+
#define PCI_EXP_DEVSTA_ERR 0xf /* Mask of the error-detected bits (bits 3:0), incl. NFED, FED and URD */
537538
#define PCI_EXP_DEVSTA_AUXPD 0x0010 /* AUX Power Detected */
538539
#define PCI_EXP_DEVSTA_TRPND 0x0020 /* Transactions Pending */
539540
#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12 /* v1 endpoints without link end here */

0 commit comments

Comments
 (0)