From 63d230184da3f3784ecf968486e1fa20ecbeece6 Mon Sep 17 00:00:00 2001 From: Lukasz Raczylo Date: Fri, 24 Apr 2026 00:00:00 +0000 Subject: [PATCH 1/3] net: macb: flush PCIe posted write after TSTART doorbell macb_start_xmit() and macb_tx_restart() kick transmission by OR-ing MACB_BIT(TSTART) into NCR. On PCIe-attached macb instances (BCM2712 + RP1 PCIe south bridge on Raspberry Pi 5 is the setup we have in front of us), writes to NCR are posted PCIe writes: they are not guaranteed to reach the device before the issuing CPU returns. An existing source-level comment at the TSTART site acknowledges that such writes can be lost under some conditions: /* TSTART write might get dropped, so make the IRQ retrigger * a buffer read */ and arms a recovery handshake via queue->tx_pending / queue->txubr_pending that runs on the next TCOMP interrupt. That recovery path depends on a subsequent TCOMP actually firing. If the TSTART write never reaches the MAC, no TX begins, no TCOMP completion arrives, and the ring remains quiescent without any kernel-visible indication. Add a read-back of NCR after each TSTART write in macb_start_xmit() and macb_tx_restart(). The read is an architected PCIe read barrier for earlier posted writes on the same path; it ensures the doorbell has reached the MAC before the functions return. The existing tx_pending / txubr_pending handshake is left in place unchanged -- it remains the correct recovery for any other reason the MAC could silently fail to start TX. We do not have direct hardware evidence that TSTART is being lost on the RP1 path. This patch is one of a three-patch series ("candidate fixes for silent TX stall on BCM2712/RP1"); see the cover letter for context. We have verified it compiles and applies cleanly; runtime verification is pending. 
Link: https://github.com/cilium/cilium/issues/43198 Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 Signed-off-by: Lukasz Raczylo --- drivers/net/ethernet/cadence/macb_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 606e1151ef71d..bf5beeb3a349e 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1949,6 +1949,12 @@ static void macb_tx_restart(struct macb_queue *queue) spin_lock(&bp->lock); macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); + /* Flush the PCIe posted-write queue so the TSTART doorbell + * reliably reaches the MAC. Without this, the write can sit + * in the fabric and the MAC never advances, causing a silent + * TX stall. + */ + (void)macb_readl(bp, NCR); spin_unlock(&bp->lock); out_tx_ptr_unlock: @@ -2630,6 +2636,10 @@ static netdev_tx_t macb_start_xmit(struct sk_buff *skb, struct net_device *dev) queue->tx_pending = 1; macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); + /* Flush the PCIe posted-write queue; see the comment in + * macb_tx_restart() for the reasoning. + */ + (void)macb_readl(bp, NCR); spin_unlock(&bp->lock); if (CIRC_SPACE(queue->tx_head, queue->tx_tail, bp->tx_ring_size) < 1) From 3ccf780ce058a026ba1e860424193fdfdb242ee5 Mon Sep 17 00:00:00 2001 From: Lukasz Raczylo Date: Fri, 24 Apr 2026 00:00:00 +0000 Subject: [PATCH 2/3] net: macb: re-check ISR after IER re-enable in macb_tx_poll macb_tx_poll() runs with TCOMP masked, drains the TX ring, then calls napi_complete_done() and re-enables TCOMP via IER. An existing comment in the function notes: /* Packet completions only seem to propagate to raise * interrupts when interrupts are enabled at the time, so if * packets were sent while interrupts were disabled, * they will not cause another interrupt to be generated when * interrupts are re-enabled. 
*/ and mitigates this by calling macb_tx_complete_pending(), which inspects driver-visible ring state (descriptor->ctrl, after rmb()) and reschedules NAPI if a completion is observable in memory. On PCIe-attached parts (BCM2712 + RP1 on Raspberry Pi 5 is the setup we have in front of us), the descriptor DMA write that sets TX_USED may not have retired to system memory at the point macb_tx_complete_pending() runs. The rmb() synchronises the CPU view of earlier CPU writes; it is not sufficient to retire an in-flight peripheral DMA write. Under that ordering the in-memory descriptor can still read TX_USED=0 when the hardware has in fact completed the frame; the check returns false; NAPI exits; the quirk above prevents the re-enabled IER from re-firing; the ring goes quiescent. Add an explicit ISR read after the IER write. The MMIO read serves two independent purposes: (1) It is an architected PCIe read barrier for earlier peripheral-originated DMA writes on the same path, so a subsequent macb_tx_complete_pending() observes any TX_USED write that was in flight at the time of the barrier. (2) It samples the hardware ISR directly, so a TCOMP bit that the hardware set while TCOMP was masked is visible here, independently of whether the descriptor DMA has retired. If either signal indicates pending work, reschedule NAPI via the same path as the existing check. Caveat: on macb variants without MACB_CAPS_ISR_CLEAR_ON_WRITE, ISR is clear-on-read, so this added read acknowledges and discards every pending interrupt cause, not only TCOMP -- an RCOMP or error bit sampled here would be lost rather than handled. Before this is safe on such parts, the read either needs to be gated on MACB_CAPS_ISR_CLEAR_ON_WRITE or the value read back needs to be re-dispatched into the normal interrupt handling. This patch addresses one of three candidate races for the silent TX stall described in the cover letter. Whether it is sufficient by itself, or whether it requires the PCIe posted-write flush in patch 1/3 to cover the observed behaviour, we have not yet verified at runtime. 
Link: https://github.com/cilium/cilium/issues/43198 Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 Signed-off-by: Lukasz Raczylo --- drivers/net/ethernet/cadence/macb_main.c | 27 +++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index bf5beeb3a349e..fbc7893b017b7 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1999,17 +1999,24 @@ static int macb_tx_poll(struct napi_struct *napi, int budget) if (work_done < budget && napi_complete_done(napi, work_done)) { queue_writel(queue, IER, MACB_BIT(TCOMP)); - /* Packet completions only seem to propagate to raise - * interrupts when interrupts are enabled at the time, so if - * packets were sent while interrupts were disabled, - * they will not cause another interrupt to be generated when - * interrupts are re-enabled. - * Check for this case here to avoid losing a wakeup. This can - * potentially race with the interrupt handler doing the same - * actions if an interrupt is raised just after enabling them, - * but this should be harmless. + /* TCOMP events that fire while the interrupt is masked do + * not re-fire when IER is re-enabled. Catch this two ways + * to avoid losing a wakeup: + * + * (1) Read ISR -- catches completions the hardware flagged + * but that we did not see as an interrupt. The MMIO + * read doubles as a PCIe read barrier, flushing any + * in-flight descriptor TX_USED DMA writes into memory. + * (2) macb_tx_complete_pending() inspects the ring after + * that flush, catching a descriptor whose TX_USED is + * now visible as a result of the barrier. + * + * This can race with the interrupt handler taking the same + * path if an interrupt fires just after the IER write; + * rescheduling NAPI in that case is harmless. 
*/ - if (macb_tx_complete_pending(queue)) { + if ((queue_readl(queue, ISR) & MACB_BIT(TCOMP)) || + macb_tx_complete_pending(queue)) { queue_writel(queue, IDR, MACB_BIT(TCOMP)); if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) queue_writel(queue, ISR, MACB_BIT(TCOMP)); From 8ea87c96f1a6f8f7975fb4805099f5ba9bd9eb32 Mon Sep 17 00:00:00 2001 From: Lukasz Raczylo Date: Fri, 24 Apr 2026 00:00:00 +0000 Subject: [PATCH 3/3] net: macb: add TX stall watchdog as defence-in-depth safety net Patches 1/3 and 2/3 address two candidate races that could lead to a TCOMP completion being missed on PCIe-attached macb instances. This patch adds a defence-in-depth safety net, in case a further race remains that we have not identified. The watchdog is a per-queue delayed_work that runs once per second. Movement is tracked via a tx_stall_tail_moved boolean: macb_tx_complete() sets it under tx_ptr_lock whenever tx_tail advances, and the watchdog clears it under the same lock at each tick. If the ring is non-empty (tx_head != tx_tail) and the boolean was still false at the next tick, the watchdog calls macb_tx_restart(). A boolean is used in preference to snapshotting tx_tail and comparing across ticks, because per-queue ring indices are bounded and reused; under sustained load a snapshot comparison can false-positive when the index happens to land on the same value between two ticks. Both writes share tx_ptr_lock with the existing tx_head / tx_tail updates, so no atomic is required. No new recovery logic is introduced. macb_tx_restart() already exists in this file and verifies that the hardware's TBQP is behind the driver's head index before re-asserting TSTART. Note on locking: macb_tx_restart() takes tx_ptr_lock and bp->lock with plain spin_lock(), which assumes a softirq (NAPI) caller; invoking it from this watchdog's workqueue process context must be bracketed with local_bh_disable()/local_bh_enable(), otherwise a TX softirq interrupting the lock holder on the same CPU can deadlock on tx_ptr_lock. On a healthy ring it is a no-op at the hardware level; the watchdog only supplies the missing trigger. On a healthy queue the per-tick cost is one spin_lock_irqsave() / spin_unlock_irqrestore(), one branch, and one byte store. 
The delayed_work is only scheduled between macb_open() and macb_close(), and is cancelled synchronously on close. Context for submission: on our 24-node Raspberry Pi 5 fleet, before this series, an out-of-band user-space watchdog (monitoring tx_packets from /sys/class/net/.../statistics and toggling the link down/up when it froze) was required to keep nodes usable. We include this kernel-side watchdog as a cleaner in-kernel equivalent for any residual stall that patches 1 and 2 do not cover. We are willing to drop this patch if the view is that 1 and 2 should stand alone. Link: https://github.com/cilium/cilium/issues/43198 Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 Signed-off-by: Lukasz Raczylo --- drivers/net/ethernet/cadence/macb.h | 11 ++++ drivers/net/ethernet/cadence/macb_main.c | 65 ++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index 8c344f70849fb..bdbafe09714d4 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -1294,6 +1294,17 @@ struct macb_queue { struct work_struct tx_error_task; bool txubr_pending; bool tx_pending; + + /* TX stall watchdog -- see macb_tx_stall_watchdog() in macb_main.c. + * tx_stall_tail_moved is set by macb_tx_complete() under tx_ptr_lock + * whenever tx_tail advances, and cleared by the watchdog tick on the + * same lock. A bool avoids the index-aliasing false-positive that a + * snapshot-of-tx_tail comparison would have when the ring index space + * happens to wrap to the same value between two ticks. 
+ */ + struct delayed_work tx_stall_watchdog_work; + bool tx_stall_tail_moved; + struct napi_struct napi_tx; dma_addr_t rx_ring_dma; diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index fbc7893b017b7..423aac02a0d47 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1505,6 +1505,8 @@ static int macb_tx_complete(struct macb_queue *queue, int budget) packets, bytes); queue->tx_tail = tail; + if (packets) + queue->tx_stall_tail_moved = true; if (__netif_subqueue_stopped(bp->dev, queue_index) && CIRC_CNT(queue->tx_head, queue->tx_tail, bp->tx_ring_size) <= MACB_TX_WAKEUP_THRESH(bp)) @@ -2028,6 +2030,63 @@ static int macb_tx_poll(struct napi_struct *napi, int budget) return work_done; } +#define MACB_TX_STALL_INTERVAL_MS 1000 + +/* TX stall watchdog. + * + * Defence-in-depth against lost TCOMP interrupts. macb already has a + * recovery chain (tx_pending -> txubr_pending -> macb_tx_restart()) + * that fires on TCOMP; if TCOMP itself is lost the TX ring stalls + * silently until something else kicks TSTART. This watchdog runs + * once per second per queue and calls macb_tx_restart() if the ring + * is non-empty and tx_tail has not advanced since the previous tick. + * + * Movement is tracked via the tx_stall_tail_moved boolean rather + * than by snapshotting tx_tail. Per-queue ring indices are bounded + * (and reused), so a snapshot comparison can false-positive when the + * index happens to land on the same value between two ticks under + * sustained load. The boolean is set by macb_tx_complete() whenever + * tx_tail advances and cleared by this watchdog after each tick; + * both writes are under tx_ptr_lock, so no atomic is required. + * + * macb_tx_restart() already checks the hardware's TBQP against the + * driver's head index before re-asserting TSTART, so on a healthy + * ring this is a no-op at the hardware level. The watchdog only + * adds the missing trigger. 
+ */ +static void macb_tx_stall_watchdog(struct work_struct *work) +{ + struct macb_queue *queue = container_of(to_delayed_work(work), + struct macb_queue, + tx_stall_watchdog_work); + struct macb *bp = queue->bp; + unsigned int cur_tail, cur_head; + bool stalled = false; + unsigned long flags; + + if (!netif_running(bp->dev)) + return; + + spin_lock_irqsave(&queue->tx_ptr_lock, flags); + cur_tail = queue->tx_tail; + cur_head = queue->tx_head; + if (cur_head != cur_tail && !queue->tx_stall_tail_moved) + stalled = true; + queue->tx_stall_tail_moved = false; + spin_unlock_irqrestore(&queue->tx_ptr_lock, flags); + + if (stalled) { + netdev_warn_once(bp->dev, + "TX stall detected on queue %u (tail=%u head=%u); re-kicking TSTART\n", + (unsigned int)(queue - bp->queues), + cur_tail, cur_head); + macb_tx_restart(queue); + } + + schedule_delayed_work(&queue->tx_stall_watchdog_work, + msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS)); +} + static void macb_hresp_error_task(struct work_struct *work) { struct macb *bp = from_work(bp, work, hresp_err_bh_work); @@ -3294,6 +3353,9 @@ static int macb_open(struct net_device *dev) for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { napi_enable(&queue->napi_rx); napi_enable(&queue->napi_tx); + queue->tx_stall_tail_moved = true; + schedule_delayed_work(&queue->tx_stall_watchdog_work, + msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS)); } macb_init_hw(bp); @@ -3340,6 +3402,7 @@ static int macb_close(struct net_device *dev) for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { napi_disable(&queue->napi_rx); napi_disable(&queue->napi_tx); + cancel_delayed_work_sync(&queue->tx_stall_watchdog_work); netdev_tx_reset_queue(netdev_get_tx_queue(dev, q)); } @@ -4938,6 +5001,8 @@ static int macb_init(struct platform_device *pdev) } INIT_WORK(&queue->tx_error_task, macb_tx_error_task); + INIT_DELAYED_WORK(&queue->tx_stall_watchdog_work, + macb_tx_stall_watchdog); q++; }