From 63d230184da3f3784ecf968486e1fa20ecbeece6 Mon Sep 17 00:00:00 2001 From: Lukasz Raczylo Date: Fri, 24 Apr 2026 00:00:00 +0000 Subject: [PATCH 1/3] net: macb: flush PCIe posted write after TSTART doorbell macb_start_xmit() and macb_tx_restart() kick transmission by OR-ing MACB_BIT(TSTART) into NCR. On PCIe-attached macb instances (BCM2712 + RP1 PCIe south bridge on Raspberry Pi 5 is the setup we have in front of us), writes to NCR are posted PCIe writes: they are not guaranteed to reach the device before the issuing CPU returns. An existing source-level comment at the TSTART site acknowledges that such writes can be lost under some conditions: /* TSTART write might get dropped, so make the IRQ retrigger * a buffer read */ and arms a recovery handshake via queue->tx_pending / queue->txubr_pending that runs on the next TCOMP interrupt. That recovery path depends on a subsequent TCOMP actually firing. If the TSTART write never reaches the MAC, no TX begins, no TCOMP completion arrives, and the ring remains quiescent without any kernel-visible indication. Add a read-back of NCR after each TSTART write in macb_start_xmit() and macb_tx_restart(). The read is an architected PCIe read barrier for earlier posted writes on the same path; it ensures the doorbell has reached the MAC before the functions return. The existing tx_pending / txubr_pending handshake is left in place unchanged -- it remains the correct recovery for any other reason the MAC could silently fail to start TX. We do not have direct hardware evidence that TSTART is being lost on the RP1 path. This patch is one of a three-patch series ("candidate fixes for silent TX stall on BCM2712/RP1"); see the cover letter for context. We have verified it compiles and applies cleanly; runtime verification is pending. 
Link: https://github.com/cilium/cilium/issues/43198 Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 Signed-off-by: Lukasz Raczylo --- drivers/net/ethernet/cadence/macb_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 606e1151ef71d..bf5beeb3a349e 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1949,6 +1949,12 @@ static void macb_tx_restart(struct macb_queue *queue) spin_lock(&bp->lock); macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); + /* Flush the PCIe posted-write queue so the TSTART doorbell + * reliably reaches the MAC. Without this, the write can sit + * in the fabric and the MAC never advances, causing a silent + * TX stall. + */ + (void)macb_readl(bp, NCR); spin_unlock(&bp->lock); out_tx_ptr_unlock: @@ -2630,6 +2636,10 @@ static netdev_tx_t macb_start_xmit(struct sk_buff *skb, struct net_device *dev) queue->tx_pending = 1; macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART)); + /* Flush the PCIe posted-write queue; see the comment in + * macb_tx_restart() for the reasoning. + */ + (void)macb_readl(bp, NCR); spin_unlock(&bp->lock); if (CIRC_SPACE(queue->tx_head, queue->tx_tail, bp->tx_ring_size) < 1) From 3ccf780ce058a026ba1e860424193fdfdb242ee5 Mon Sep 17 00:00:00 2001 From: Lukasz Raczylo Date: Fri, 24 Apr 2026 00:00:00 +0000 Subject: [PATCH 2/3] net: macb: re-check ISR after IER re-enable in macb_tx_poll macb_tx_poll() runs with TCOMP masked, drains the TX ring, then calls napi_complete_done() and re-enables TCOMP via IER. An existing comment in the function notes: /* Packet completions only seem to propagate to raise * interrupts when interrupts are enabled at the time, so if * packets were sent while interrupts were disabled, * they will not cause another interrupt to be generated when * interrupts are re-enabled. 
*/ and mitigates this by calling macb_tx_complete_pending(), which inspects driver-visible ring state (descriptor->ctrl, after rmb()) and reschedules NAPI if a completion is observable in memory. On PCIe-attached parts (BCM2712 + RP1 on Raspberry Pi 5 is the setup we have in front of us), the descriptor DMA write that sets TX_USED may not have retired to system memory at the point macb_tx_complete_pending() runs. The rmb() synchronises the CPU view of earlier CPU writes; it is not sufficient to retire an in-flight peripheral DMA write. Under that ordering the in-memory descriptor can still read TX_USED=0 when the hardware has in fact completed the frame; the check returns false; NAPI exits; the quirk above prevents the re-enabled IER from re-firing; the ring goes quiescent. Add an explicit ISR read after the IER write. The MMIO read serves two independent purposes: (1) It is an architected PCIe read barrier for earlier peripheral-originated DMA writes on the same path, so a subsequent macb_tx_complete_pending() observes any TX_USED write that was in flight at the time of the barrier. (2) It samples the hardware ISR directly, so a TCOMP bit that the hardware set while TCOMP was masked is visible here, independently of whether the descriptor DMA has retired. If either signal indicates pending work, reschedule NAPI via the same path as the existing check. Caveat: on macb variants without MACB_CAPS_ISR_CLEAR_ON_WRITE, ISR is clear-on-read, so this added read acknowledges and discards every pending interrupt cause, not only TCOMP -- an RCOMP or error bit sampled here would be lost rather than handled. Before this is safe on such parts, the read either needs to be gated on MACB_CAPS_ISR_CLEAR_ON_WRITE or the value read back needs to be re-dispatched into the normal interrupt handling. This patch addresses one of three candidate races for the silent TX stall described in the cover letter. Whether it is sufficient by itself, or whether it requires the PCIe posted-write flush in patch 1/3 to cover the observed behaviour, we have not yet verified at runtime. 
Link: https://github.com/cilium/cilium/issues/43198 Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 Signed-off-by: Lukasz Raczylo --- drivers/net/ethernet/cadence/macb_main.c | 27 +++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index bf5beeb3a349e..fbc7893b017b7 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1999,17 +1999,24 @@ static int macb_tx_poll(struct napi_struct *napi, int budget) if (work_done < budget && napi_complete_done(napi, work_done)) { queue_writel(queue, IER, MACB_BIT(TCOMP)); - /* Packet completions only seem to propagate to raise - * interrupts when interrupts are enabled at the time, so if - * packets were sent while interrupts were disabled, - * they will not cause another interrupt to be generated when - * interrupts are re-enabled. - * Check for this case here to avoid losing a wakeup. This can - * potentially race with the interrupt handler doing the same - * actions if an interrupt is raised just after enabling them, - * but this should be harmless. + /* TCOMP events that fire while the interrupt is masked do + * not re-fire when IER is re-enabled. Catch this two ways + * to avoid losing a wakeup: + * + * (1) Read ISR -- catches completions the hardware flagged + * but that we did not see as an interrupt. The MMIO + * read doubles as a PCIe read barrier, flushing any + * in-flight descriptor TX_USED DMA writes into memory. + * (2) macb_tx_complete_pending() inspects the ring after + * that flush, catching a descriptor whose TX_USED is + * now visible as a result of the barrier. + * + * This can race with the interrupt handler taking the same + * path if an interrupt fires just after the IER write; + * rescheduling NAPI in that case is harmless. 
*/ - if (macb_tx_complete_pending(queue)) { + if ((queue_readl(queue, ISR) & MACB_BIT(TCOMP)) || + macb_tx_complete_pending(queue)) { queue_writel(queue, IDR, MACB_BIT(TCOMP)); if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) queue_writel(queue, ISR, MACB_BIT(TCOMP)); From 8ea87c96f1a6f8f7975fb4805099f5ba9bd9eb32 Mon Sep 17 00:00:00 2001 From: Lukasz Raczylo Date: Fri, 24 Apr 2026 00:00:00 +0000 Subject: [PATCH 3/3] net: macb: add TX stall watchdog as defence-in-depth safety net Patches 1/3 and 2/3 address two candidate races that could lead to a TCOMP completion being missed on PCIe-attached macb instances. This patch adds a defence-in-depth safety net, in case a further race remains that we have not identified. The watchdog is a per-queue delayed_work that runs once per second. Movement is tracked via a tx_stall_tail_moved boolean: macb_tx_complete() sets it under tx_ptr_lock whenever tx_tail advances, and the watchdog clears it under the same lock at each tick. If the ring is non-empty (tx_head != tx_tail) and the boolean was still false at the next tick, the watchdog calls macb_tx_restart(). A boolean is used in preference to snapshotting tx_tail and comparing across ticks, because per-queue ring indices are bounded and reused; under sustained load a snapshot comparison can false-positive when the index happens to land on the same value between two ticks. Both writes share tx_ptr_lock with the existing tx_head / tx_tail updates, so no atomic is required. No new recovery logic is introduced. macb_tx_restart() already exists in this file and verifies that the hardware's TBQP is behind the driver's head index before re-asserting TSTART. Note on locking: macb_tx_restart() takes tx_ptr_lock and bp->lock with plain spin_lock(), which assumes a softirq (NAPI) caller; invoking it from this watchdog's workqueue process context must be bracketed with local_bh_disable()/local_bh_enable(), otherwise a TX softirq interrupting the lock holder on the same CPU can deadlock on tx_ptr_lock. On a healthy ring it is a no-op at the hardware level; the watchdog only supplies the missing trigger. On a healthy queue the per-tick cost is one spin_lock_irqsave() / spin_unlock_irqrestore(), one branch, and one byte store. 
The delayed_work is only scheduled between macb_open() and macb_close(), and is cancelled synchronously on close. Context for submission: on our 24-node Raspberry Pi 5 fleet, before this series, an out-of-band user-space watchdog (monitoring tx_packets from /sys/class/net/.../statistics and toggling the link down/up when it froze) was required to keep nodes usable. We include this kernel-side watchdog as a cleaner in-kernel equivalent for any residual stall that patches 1 and 2 do not cover. We are willing to drop this patch if the view is that 1 and 2 should stand alone. Link: https://github.com/cilium/cilium/issues/43198 Link: https://bugs.launchpad.net/ubuntu/+source/linux-raspi/+bug/2133877 Signed-off-by: Lukasz Raczylo --- drivers/net/ethernet/cadence/macb.h | 11 ++++ drivers/net/ethernet/cadence/macb_main.c | 65 ++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index 8c344f70849fb..bdbafe09714d4 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -1294,6 +1294,17 @@ struct macb_queue { struct work_struct tx_error_task; bool txubr_pending; bool tx_pending; + + /* TX stall watchdog -- see macb_tx_stall_watchdog() in macb_main.c. + * tx_stall_tail_moved is set by macb_tx_complete() under tx_ptr_lock + * whenever tx_tail advances, and cleared by the watchdog tick on the + * same lock. A bool avoids the index-aliasing false-positive that a + * snapshot-of-tx_tail comparison would have when the ring index space + * happens to wrap to the same value between two ticks. 
+ */ + struct delayed_work tx_stall_watchdog_work; + bool tx_stall_tail_moved; + struct napi_struct napi_tx; dma_addr_t rx_ring_dma; diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index fbc7893b017b7..423aac02a0d47 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1505,6 +1505,8 @@ static int macb_tx_complete(struct macb_queue *queue, int budget) packets, bytes); queue->tx_tail = tail; + if (packets) + queue->tx_stall_tail_moved = true; if (__netif_subqueue_stopped(bp->dev, queue_index) && CIRC_CNT(queue->tx_head, queue->tx_tail, bp->tx_ring_size) <= MACB_TX_WAKEUP_THRESH(bp)) @@ -2028,6 +2030,63 @@ static int macb_tx_poll(struct napi_struct *napi, int budget) return work_done; } +#define MACB_TX_STALL_INTERVAL_MS 1000 + +/* TX stall watchdog. + * + * Defence-in-depth against lost TCOMP interrupts. macb already has a + * recovery chain (tx_pending -> txubr_pending -> macb_tx_restart()) + * that fires on TCOMP; if TCOMP itself is lost the TX ring stalls + * silently until something else kicks TSTART. This watchdog runs + * once per second per queue and calls macb_tx_restart() if the ring + * is non-empty and tx_tail has not advanced since the previous tick. + * + * Movement is tracked via the tx_stall_tail_moved boolean rather + * than by snapshotting tx_tail. Per-queue ring indices are bounded + * (and reused), so a snapshot comparison can false-positive when the + * index happens to land on the same value between two ticks under + * sustained load. The boolean is set by macb_tx_complete() whenever + * tx_tail advances and cleared by this watchdog after each tick; + * both writes are under tx_ptr_lock, so no atomic is required. + * + * macb_tx_restart() already checks the hardware's TBQP against the + * driver's head index before re-asserting TSTART, so on a healthy + * ring this is a no-op at the hardware level. The watchdog only + * adds the missing trigger. 
+ */ +static void macb_tx_stall_watchdog(struct work_struct *work) +{ + struct macb_queue *queue = container_of(to_delayed_work(work), + struct macb_queue, + tx_stall_watchdog_work); + struct macb *bp = queue->bp; + unsigned int cur_tail, cur_head; + bool stalled = false; + unsigned long flags; + + if (!netif_running(bp->dev)) + return; + + spin_lock_irqsave(&queue->tx_ptr_lock, flags); + cur_tail = queue->tx_tail; + cur_head = queue->tx_head; + if (cur_head != cur_tail && !queue->tx_stall_tail_moved) + stalled = true; + queue->tx_stall_tail_moved = false; + spin_unlock_irqrestore(&queue->tx_ptr_lock, flags); + + if (stalled) { + netdev_warn_once(bp->dev, + "TX stall detected on queue %u (tail=%u head=%u); re-kicking TSTART\n", + (unsigned int)(queue - bp->queues), + cur_tail, cur_head); + macb_tx_restart(queue); + } + + schedule_delayed_work(&queue->tx_stall_watchdog_work, + msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS)); +} + static void macb_hresp_error_task(struct work_struct *work) { struct macb *bp = from_work(bp, work, hresp_err_bh_work); @@ -3294,6 +3353,9 @@ static int macb_open(struct net_device *dev) for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { napi_enable(&queue->napi_rx); napi_enable(&queue->napi_tx); + queue->tx_stall_tail_moved = true; + schedule_delayed_work(&queue->tx_stall_watchdog_work, + msecs_to_jiffies(MACB_TX_STALL_INTERVAL_MS)); } macb_init_hw(bp); @@ -3340,6 +3402,7 @@ static int macb_close(struct net_device *dev) for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) { napi_disable(&queue->napi_rx); napi_disable(&queue->napi_tx); + cancel_delayed_work_sync(&queue->tx_stall_watchdog_work); netdev_tx_reset_queue(netdev_get_tx_queue(dev, q)); } @@ -4938,6 +5001,8 @@ static int macb_init(struct platform_device *pdev) } INIT_WORK(&queue->tx_error_task, macb_tx_error_task); + INIT_DELAYED_WORK(&queue->tx_stall_watchdog_work, + macb_tx_stall_watchdog); q++; }