Skip to content

Commit 99e6989

Browse files
fisherxue authored
and claude committed
Remove _apply_temporal_reuse_corrections post-processing step
Temporal reuse should be expressed purely through mapping structure (placing Storage nodes above or below loops), not detected implicitly. If a tensor's Storage is below an irrelevant loop, fills are inflated — and that's correct per the mapping. Users can split Storage nodes to control reuse explicitly. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 769aca9 commit 99e6989

2 files changed

Lines changed: 0 additions & 228 deletions

File tree

accelforge/model/run_model.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
from accelforge.model.sparse_adjustment import (
1919
apply_sparse_adjustments,
2020
LatencyInfo,
21-
_apply_temporal_reuse_corrections,
2221
)
2322
from accelforge.mapper.FFM._join_pmappings.pmapping_dataframe import (
2423
nameloop2col,
@@ -50,10 +49,6 @@ def run_model(
5049
job, add_reservations=add_reservations
5150
)
5251

53-
# Temporal reuse correction: divide inflated parent-facing stats for
54-
# buffers that sit inside contiguous irrelevant temporal loops.
55-
_apply_temporal_reuse_corrections(reuse, spec, job)
56-
5752
# Phase 1: Dense latency (before sparse adjustments)
5853
latency = component_latency(reuse, job.flattened_arch, pmapping, spec)
5954
try:

accelforge/model/sparse_adjustment.py

Lines changed: 0 additions & 223 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,9 @@
1414
Storage as StorageNode,
1515
Toll as TollNode,
1616
Compute as ComputeNode,
17-
Reservation,
1817
)
1918

2019
from accelforge.frontend.spec import Spec
21-
from accelforge.frontend._workload_isl._symbolic import (
22-
get_rank_variable_relevancy,
23-
Irrelevant,
24-
)
25-
from accelforge.frontend.workload import TensorName
2620
from accelforge.mapper.FFM._make_pmappings.pmapper_job import Job
2721
from accelforge.model._looptree.reuse.symbolic import (
2822
Compute,
@@ -237,223 +231,6 @@ def _get_tensor_rank_variables(einsum, tensor_name: str) -> set[str]:
237231
return rank_vars
238232

239233

240-
def _apply_temporal_reuse_corrections(
241-
reuse: SymbolicAnalysisOutput,
242-
spec: Spec,
243-
job: Job,
244-
) -> None:
245-
"""Correct inflated fills caused by irrelevant temporal loops in the dense model.
246-
247-
The dense model's repeat_temporal multiplies ALL buffet stats (including
248-
lower-level stats that propagate upward) by the temporal iteration count,
249-
regardless of whether the loop variable is relevant to the tensor. When
250-
contiguous innermost irrelevant temporals sit above a storage zone, the
251-
buffer retains data across those iterations — the "temporal reuse" concept
252-
from Sparseloop.
253-
254-
This function computes the reuse factor for each buffet by walking the
255-
per-tensor mapping upward from each Storage/Toll node, collecting the
256-
innermost contiguous block of irrelevant temporal iterations (skipping
257-
Spatials and Reservations, continuing through Tolls). It then applies
258-
delta-based corrections to the inflated stats and action counts.
259-
260-
Only corrected buffets and their parents are modified — all other buffet
261-
stats remain untouched.
262-
"""
263-
if not hasattr(reuse, "tensor2mapping") or not reuse.tensor2mapping:
264-
return
265-
266-
workload = spec.workload
267-
einsum_name = job.einsum_name
268-
einsum = workload.einsums[einsum_name]
269-
270-
for tensor_name, mapping in reuse.tensor2mapping.items():
271-
relevancy = get_rank_variable_relevancy(einsum, TensorName(tensor_name))
272-
nodes = mapping.nodes
273-
274-
# Build a dict of temporal iteration counts by walking top-down
275-
# and tracking the remaining shape at each node.
276-
shape = dict(job.rank_variable_bounds)
277-
node_iterations: dict[int, int] = {} # node_index -> iteration_count
278-
for idx, node in enumerate(nodes):
279-
if isinstance(node, (TemporalNode, SpatialNode)):
280-
rv = str(node.rank_variable) if node.rank_variable else None
281-
if rv and rv in shape and node.tile_shape is not None:
282-
try:
283-
ts = int(node.tile_shape)
284-
remaining = int(shape[rv])
285-
iters = math.ceil(remaining / ts) if ts > 0 else 1
286-
node_iterations[idx] = iters
287-
shape[rv] = ts
288-
except (TypeError, ValueError):
289-
pass
290-
291-
# For each Storage/Toll node that holds this tensor, compute the
292-
# temporal reuse factor from the zone above it.
293-
for i, node in enumerate(nodes):
294-
if not isinstance(node, (StorageNode, TollNode)):
295-
continue
296-
if tensor_name not in [str(t) for t in node.tensors]:
297-
continue
298-
299-
buffet = Buffet(tensor_name, einsum_name, node.component)
300-
if buffet not in reuse.buffet_stats:
301-
continue
302-
303-
# Walk upward: collect contiguous innermost irrelevant temporals.
304-
# Skip Spatials, Reservations, and Tolls (pass-through).
305-
# Stop at relevant Temporal or Storage (parent boundary).
306-
reuse_factor = 1
307-
for j in range(i - 1, -1, -1):
308-
above = nodes[j]
309-
if isinstance(above, (SpatialNode, Reservation)):
310-
continue
311-
if isinstance(above, TollNode):
312-
# Continue through Toll only if it doesn't hold the tensor
313-
# (i.e., it's a pass-through for this tensor's data path)
314-
if tensor_name in [str(t) for t in above.tensors]:
315-
continue
316-
continue
317-
if isinstance(above, TemporalNode):
318-
rv = str(above.rank_variable) if above.rank_variable else None
319-
if rv and isinstance(relevancy.get(rv), Irrelevant):
320-
iters = node_iterations.get(j, 1)
321-
if iters > 1:
322-
reuse_factor *= iters
323-
continue
324-
else:
325-
break # Relevant temporal → end of contiguous block
326-
if isinstance(above, StorageNode):
327-
break # Parent storage boundary
328-
329-
if reuse_factor <= 1:
330-
continue
331-
332-
# Delta-based correction: only modify this buffet and its parent.
333-
stats = reuse.buffet_stats[buffet]
334-
reduction = 1.0 - 1.0 / reuse_factor # fraction to subtract
335-
336-
# Save old values for delta computation.
337-
old_reads_to_parent = float(stats.total_reads_to_parent)
338-
old_max_reads_to_parent = float(stats.max_per_parent_reads_to_parent)
339-
old_skip_reads = float(stats.total_skipped_first_reads_to_parent)
340-
old_min_skip_reads = float(
341-
stats.min_per_parent_skipped_first_reads_to_parent
342-
)
343-
344-
# Correct element counts.
345-
inv = 1.0 / reuse_factor
346-
stats.total_reads_to_parent *= inv
347-
stats.max_per_parent_reads_to_parent *= inv
348-
stats.total_skipped_first_reads_to_parent *= inv
349-
stats.min_per_parent_skipped_first_reads_to_parent *= inv
350-
351-
# Correct this buffet's fill action counts (write_actions from fills).
352-
component_obj = spec.arch.find(buffet.level)
353-
if not isinstance(component_obj, arch.TensorHolder):
354-
continue
355-
ta = _find_tensor_access(einsum, buffet.tensor)
356-
if ta is None:
357-
continue
358-
count_writes = not isinstance(component_obj, arch.Toll)
359-
if count_writes:
360-
bpvs = component_obj.bits_per_value_scale[buffet.tensor]
361-
bpv = bpvs * ta.bits_per_value
362-
write_bpa = component_obj.actions["write"].bits_per_action
363-
write_scale = bpv / write_bpa
364-
365-
delta_write = old_reads_to_parent * reduction * write_scale
366-
stats.total_write_actions -= delta_write
367-
stats.max_per_unit_write_actions -= delta_write
368-
delta_skip_write = old_skip_reads * reduction * write_scale
369-
stats.total_skipped_first_write_actions -= delta_skip_write
370-
stats.min_per_unit_skipped_first_write_actions -= delta_skip_write
371-
372-
# Propagate correction upward through the buffet chain.
373-
# Tolls with propagate_child_results add child.reads_to_parent
374-
# to their own reads_to_parent via inherit_add BEFORE spatial/
375-
# temporal multiplications. Thus the absolute delta in
376-
# reads_to_parent is the same at every level in the chain.
377-
#
378-
# Walk up: correct each Toll's reads_to_parent and action counts,
379-
# then correct the first Storage parent's read action counts.
380-
delta_reads = old_reads_to_parent * reduction
381-
delta_max_reads = old_max_reads_to_parent * reduction
382-
delta_skip = old_skip_reads * reduction
383-
delta_min_skip = old_min_skip_reads * reduction
384-
385-
cur = buffet
386-
while True:
387-
parent_buffet = _get_parent_buffet(reuse, cur)
388-
if parent_buffet is None:
389-
break
390-
parent_stats = reuse.buffet_stats[parent_buffet]
391-
parent_obj = spec.arch.find(parent_buffet.level)
392-
if not isinstance(parent_obj, arch.TensorHolder):
393-
break
394-
395-
p_bpvs = parent_obj.bits_per_value_scale[parent_buffet.tensor]
396-
p_bpv = p_bpvs * ta.bits_per_value
397-
p_read_bpa = parent_obj.actions["read"].bits_per_action
398-
p_read_scale = p_bpv / p_read_bpa
399-
is_toll = isinstance(parent_obj, arch.Toll)
400-
401-
if is_toll:
402-
# Toll: correct its reads_to_parent (inherited from child)
403-
# and continue upward.
404-
parent_stats.total_reads_to_parent -= delta_reads
405-
parent_stats.max_per_parent_reads_to_parent -= delta_max_reads
406-
parent_stats.total_skipped_first_reads_to_parent -= delta_skip
407-
parent_stats.min_per_parent_skipped_first_reads_to_parent -= (
408-
delta_min_skip
409-
)
410-
# Toll read_actions (serving child) — usually 0 energy.
411-
parent_stats.total_read_actions -= delta_reads * p_read_scale
412-
parent_stats.max_per_unit_read_actions -= (
413-
delta_max_reads * p_read_scale
414-
)
415-
parent_stats.total_skipped_first_read_actions -= (
416-
delta_skip * p_read_scale
417-
)
418-
parent_stats.min_per_unit_skipped_first_read_actions -= (
419-
delta_min_skip * p_read_scale
420-
)
421-
cur = parent_buffet
422-
continue
423-
else:
424-
# Storage: correct read actions from serving child fills.
425-
parent_stats.total_read_actions -= delta_reads * p_read_scale
426-
parent_stats.max_per_unit_read_actions -= (
427-
delta_max_reads * p_read_scale
428-
)
429-
parent_stats.total_skipped_first_read_actions -= (
430-
delta_skip * p_read_scale
431-
)
432-
parent_stats.min_per_unit_skipped_first_read_actions -= (
433-
delta_min_skip * p_read_scale
434-
)
435-
break
436-
437-
438-
def _get_parent_buffet(
439-
reuse: SymbolicAnalysisOutput,
440-
buffet: Buffet,
441-
) -> Buffet | None:
442-
"""Find the parent (outer-level) Buffet key for the same tensor.
443-
444-
buffet_stats are ordered inner-to-outer, so the parent is the next
445-
matching entry after the current buffet in forward iteration order.
446-
"""
447-
seen = False
448-
for b in reuse.buffet_stats:
449-
if not seen:
450-
seen = b == buffet
451-
continue
452-
if b.tensor == buffet.tensor and b.einsum == buffet.einsum:
453-
return b
454-
return None
455-
456-
457234
def _compute_buffet_tile_shapes(
458235
reuse: SymbolicAnalysisOutput,
459236
job: Job,

0 commit comments

Comments
 (0)