From 140b3a53c7574790fe23d40e58781e6d0232be68 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Tue, 19 May 2026 15:46:02 +0100
Subject: [PATCH 1/8] compiler: Attach deriv_order to IndexDerivative

---
 devito/finite_differences/differentiable.py    | 12 ++++++++++--
 devito/finite_differences/finite_difference.py |  4 +++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py
index 6322b8ca8f..cfaa31ebc7 100644
--- a/devito/finite_differences/differentiable.py
+++ b/devito/finite_differences/differentiable.py
@@ -985,8 +985,9 @@ def value(self, idx):
 class IndexDerivative(IndexSum):
 
     __rargs__ = ('expr', 'mapper')
+    __rkwargs__ = IndexSum.__rkwargs__ + ('deriv_order',)
 
-    def __new__(cls, expr, mapper, **kwargs):
+    def __new__(cls, expr, mapper, deriv_order=None, **kwargs):
         dimensions = as_tuple(set(mapper.values()))
 
         # Detect the Weights among the arguments
@@ -1008,6 +1009,8 @@ def __new__(cls, expr, mapper, **kwargs):
         obj._weights = weights
         obj._mapper = frozendict(mapper)
 
+        obj._deriv_order = deriv_order
+
         return obj
 
     def _hashable_content(self):
@@ -1036,6 +1039,10 @@ def weights(self):
     def mapper(self):
         return self._mapper
 
+    @property
+    def deriv_order(self):
+        return self._deriv_order
+
     @property
     def depth(self):
         iderivs = self.expr.find(IndexDerivative)
@@ -1212,7 +1219,8 @@ def _diff2sympy(obj):
 
         # Handle special objects
         if isinstance(obj, DiffDerivative):
-            return IndexDerivative(*args, obj.mapper), True
+            return IndexDerivative(*args, obj.mapper,
+                                   deriv_order=obj.deriv_order), True
 
         # Handle generic objects such as arithmetic operations
         try:
diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py
index 30199fb3d8..33c1a95a30 100644
--- a/devito/finite_differences/finite_difference.py
+++ b/devito/finite_differences/finite_difference.py
@@ -209,7 +209,9 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, coeffici
         with suppress(AttributeError):
             expr = expr._evaluate(expand=False)
 
-        deriv = DiffDerivative(expr*weights, {dim: indices.free_dim})
+        deriv = DiffDerivative(
+            expr*weights, {dim: indices.free_dim}, deriv_order=deriv_order
+        )
     else:
         terms = []
         for i, c in zip(indices, weights, strict=True):

From 83c510d21df7d3d1b83cd31307c233c83262d7fc Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 25 May 2026 11:20:05 +0100
Subject: [PATCH 2/8] compiler: Add cire-maxpar=compact mode

---
 devito/core/gpu.py                |  2 +-
 devito/core/operator.py           |  3 ++
 devito/passes/clusters/aliases.py | 68 +++++++++++++++++++++++++------
 tests/test_dse.py                 | 13 +++---
 4 files changed, 67 insertions(+), 19 deletions(-)

diff --git a/devito/core/gpu.py b/devito/core/gpu.py
index bd13623441..06cb42eb37 100644
--- a/devito/core/gpu.py
+++ b/devito/core/gpu.py
@@ -78,7 +78,7 @@ def _normalize_kwargs(cls, **kwargs):
         # CIRE
         o['min-storage'] = False
         o['cire-rotate'] = False
-        o['cire-maxpar'] = oo.pop('cire-maxpar', True)
+        o['cire-maxpar'] = oo.pop('cire-maxpar', 'basic')
         o['cire-ftemps'] = oo.pop('cire-ftemps', False)
         o['cire-mingain'] = oo.pop('cire-mingain', cls.CIRE_MINGAIN)
         o['cire-minmem'] = oo.pop('cire-minmem', cls.CIRE_MINMEM)
diff --git a/devito/core/operator.py b/devito/core/operator.py
index 8a52cbf98a..16c403ba79 100644
--- a/devito/core/operator.py
+++ b/devito/core/operator.py
@@ -240,6 +240,9 @@ def _check_kwargs(cls, **kwargs):
         if oo['mpi'] and oo['mpi'] not in cls.MPI_MODES:
             raise InvalidOperator(f"Unsupported MPI mode `{oo['mpi']}`")
 
+        if oo['cire-maxpar'] not in (False, 'basic', 'compact'):
+            raise InvalidOperator("Illegal `cire-maxpar` value")
+
         if oo['cse-algo'] not in ('basic', 'smartsort', 'advanced'):
             raise InvalidOperator("Illegal `cse-algo` value")
 
diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py
index 58078847a2..20647c2084 100644
--- a/devito/passes/clusters/aliases.py
+++ b/devito/passes/clusters/aliases.py
@@ -140,8 +140,10 @@ def _aliases_from_clusters(self, cgroup, exclude, meta):
         schedule, exprs = self._select(variants)
 
         # Schedule -> Schedule (optimization)
-        if self.opt_maxpar:
-            schedule = optimize_schedule_maxpar(schedule)
+        if self.opt_maxpar == 'basic':
+            schedule = optimize_schedule_maxpar_basic(schedule)
+        elif self.opt_maxpar == 'compact':
+            schedule = optimize_schedule_maxpar_compact(schedule)
 
         # Schedule -> Schedule (optimization)
         if self.opt_rotate:
@@ -285,7 +287,7 @@ class CireInvariants(CireTransformerLegacy, Queue):
     def __init__(self, sregistry, options, platform):
         super().__init__(sregistry, options, platform)
 
-        self.opt_maxpar = True
+        self.opt_maxpar = 'basic'
         self.opt_schedule_strategy = None
 
     def process(self, clusters):
@@ -756,8 +758,11 @@ def lower_aliases(aliases, meta, maxpar):
                                 meta.ispace.directions)
         ispace = ispace.augment(sub_iterators)
 
+        # For now, the guards coincide with the original ones, if any
+        guards = meta.guards
+
         processed.append(
-            ScheduledAlias(a.pivot, writeto, ispace, a.aliaseds, indicess)
+            ScheduledAlias(a.pivot, writeto, ispace, a.aliaseds, indicess, guards)
         )
 
     # The [ScheduledAliases] must be ordered so as to reuse as many of the
@@ -874,7 +879,7 @@ def optimize_schedule_rotations(schedule, sregistry):
             ispace = IterationSpace.union(rispace, aispace, relations={(d, cd, d1)})
 
             processed.append(ScheduledAlias(
-                pivot, writeto, ispace, i.aliaseds, indicess,
+                pivot, writeto, ispace, i.aliaseds, indicess, i.guards
             ))
 
         # Update the rotations mapper
@@ -883,9 +888,11 @@ def optimize_schedule_rotations(schedule, sregistry):
     return schedule.rebuild(*processed, rmapper=rmapper)
 
 
-def optimize_schedule_maxpar(schedule):
+def optimize_schedule_maxpar_basic(schedule):
     """
-    Bump the IterationSpace' stamp trading fusion for more collapse-parallelism.
+    Trade fusion for more collapse-parallelism. This is more conservative than
+    `optimize_schedule_maxpar_compact`, since if two IterationSpaces have
+    different endpoints along the same Dimensions, they won't be fused together.
     """
     key = lambda i: (i.writeto, i.ispace)
 
@@ -900,13 +907,49 @@ def optimize_schedule_maxpar(schedule):
         ispace = ispace0.lift(dims, stamp)
 
         processed.extend([
-            ScheduledAlias(pivot, writeto, ispace, aliaseds, indicess)
-            for pivot, _, _, aliaseds, indicess in g
+            ScheduledAlias(pivot, writeto, ispace, aliaseds, indicess, guards)
+            for pivot, _, _, aliaseds, indicess, guards in g
         ])
 
     return schedule.rebuild(*processed)
 
 
+def optimize_schedule_maxpar_compact(schedule):
+    """
+    Aggressively trade fusion for more collapse-parallelism. Unlike
+    `optimize_schedule_maxpar_basic`, this will fuse together IterationSpaces with
+    different endpoints along the same Dimensions by introducing guards to
+    preserve the original semantics.
+    """
+    stamp = Stamp()
+
+    # The union IterationSpace will enable maximal fusion
+    ispace_union = IterationSpace.union(*[i.ispace for i in schedule])
+
+    processed = []
+    for pivot, writeto, ispace, aliaseds, indicess, guards in schedule:
+        dims = writeto.itdims
+
+        # Do we need to introduce guards to preserve the original semantics?
+        for d in dims:
+            int0 = ispace[d]
+            int1 = ispace_union[d]
+
+            if int0 != int1:
+                guard = sympy.And(d >= int0.symbolic_min, d <= int0.symbolic_max)
+                guards = guards.xandg(d, guard)
+
+        # This is conceptually identical to what we do for the `basic` mode
+        writeto = writeto.lift(dims, stamp)
+        ispace = ispace_union.lift(dims, stamp)
+
+        processed.append(ScheduledAlias(
+            pivot, writeto, ispace, aliaseds, indicess, guards
+        ))
+
+    return schedule.rebuild(*processed)
+
+
 def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype,
                    opt_minmem):
     """
@@ -917,7 +960,7 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype,
 
     clusters = []
     subs = {}
-    for pivot, writeto, ispace, aliaseds, indicess in schedule:
+    for pivot, writeto, ispace, aliaseds, indicess, guards in schedule:
         name = sregistry.make_name()
         # Infer the dtype for the pivot
         # This prevents cases such as `floor(a*b)` with `a` and `b` floats
@@ -1013,7 +1056,7 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype,
             properties[Hyperplane(writeto.itdims)] = {SEPARABLE}
 
         # Finally, build the alias Cluster
-        clusters.append(Cluster(expression, ispace, meta.guards, properties))
+        clusters.append(Cluster(expression, ispace, guards, properties))
 
     return clusters, subs
 
@@ -1432,7 +1475,8 @@ def is_frame(self):
 
 
 ScheduledAlias = namedtuple('SchedAlias',
-                            'pivot writeto ispace aliaseds indicess')
+                            'pivot writeto ispace aliaseds indicess guards')
+ScheduledAlias.__new__.__defaults__ = (None, None, None, None, None, None)
 
 
 class Schedule(tuple):
diff --git a/tests/test_dse.py b/tests/test_dse.py
index 3dcbd40336..9c3335c8af 100644
--- a/tests/test_dse.py
+++ b/tests/test_dse.py
@@ -2164,7 +2164,7 @@ def test_derivatives_from_different_levels(self):
     @pytest.mark.parametrize('rotate', [False, True])
     def test_maxpar_option(self, rotate):
         """
-        Test the compiler option `cire-maxpar=True`.
+        Test the compiler option `cire-maxpar='basic'`.
         """
         grid = Grid(shape=(10, 10, 10))
 
@@ -2179,7 +2179,8 @@ def test_maxpar_option(self, rotate):
         eq = Eq(u.forward, f*u.dy.dy)
 
         op0 = Operator(eq, opt='noop')
-        op1 = Operator(eq, opt=('advanced', {'cire-maxpar': True, 'cire-rotate': rotate}))
+        op1 = Operator(eq, opt=('advanced', {'cire-maxpar': 'basic',
+                                             'cire-rotate': rotate}))
 
         # Check code generation
         bns, _ = assert_blocking(op1, {'x0_blk0'})
@@ -2248,7 +2249,7 @@ def test_multiple_rotating_dims(self):
 
     def test_maxpar_option_v2(self):
         """
-        Another test for the compiler option `cire-maxpar=True`.
+        Another test for the compiler option `cire-maxpar='basic'`.
         """
         grid = Grid(shape=(10, 10, 10))
 
@@ -2263,7 +2264,7 @@ def test_maxpar_option_v2(self):
         eq = Eq(u.forward, f*u.dx.dx)
 
         op0 = Operator(eq, opt='noop')
-        op1 = Operator(eq, opt=('advanced', {'cire-maxpar': True}))
+        op1 = Operator(eq, opt=('advanced', {'cire-maxpar': 'basic'}))
 
         # Check code generation
         bns, _ = assert_blocking(op1, {'x0_blk0'})
@@ -2279,7 +2280,7 @@ def test_maxpar_option_v2(self):
 
     def test_maxpar_option_v3(self):
         """
-        Another test for the compiler option `cire-maxpar=True`.
+        Another test for the compiler option `cire-maxpar='basic'`.
         """
         grid = Grid(shape=(10, 10))
 
@@ -2288,7 +2289,7 @@ def test_maxpar_option_v3(self):
 
         eq = Eq(u.forward, u.dx.dx + v.dx.dy)
 
-        op = Operator(eq, opt=('advanced', {'cire-maxpar': True}))
+        op = Operator(eq, opt=('advanced', {'cire-maxpar': 'basic'}))
 
         # Check code generation
         xs, ys = get_params(op, 'x_size', 'y_size')

From 1ef0ada99da2f868222f14a031736ec2df30811a Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 25 May 2026 11:20:28 +0100
Subject: [PATCH 3/8] compiler: Change blocking strategy in presence of guards

---
 devito/ir/clusters/visitors.py     | 21 ++++++++++++---------
 devito/passes/clusters/blocking.py | 13 ++++++++++---
 tests/test_dimension.py            | 21 +++++++++++++++++++++
 3 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/devito/ir/clusters/visitors.py b/devito/ir/clusters/visitors.py
index 11bcad5365..371ed4e406 100644
--- a/devito/ir/clusters/visitors.py
+++ b/devito/ir/clusters/visitors.py
@@ -37,15 +37,7 @@ def _make_key(self, cluster, level):
         assert self._q_ispace_in_key
         ispace = cluster.ispace[:level]
 
-        if self._q_guards_in_key:
-            try:
-                guards = tuple(cluster.guards.get(i.dim) for i in ispace)
-            except AttributeError:
-                # `cluster` is actually a ClusterGroup
-                assert len(cluster.guards) == 1
-                guards = tuple(cluster.guards[0].get(i.dim) for i in ispace)
-        else:
-            guards = None
+        guards = self._make_key_guards(cluster, ispace)
 
         if self._q_properties_in_key:
             properties = cluster.properties.drop(cluster.ispace[level:].itdims)
@@ -68,6 +60,17 @@ def _make_key(self, cluster, level):
 
         return (prefix,) + subkey
 
+    def _make_key_guards(self, cluster, ispace):
+        if not self._q_guards_in_key:
+            return None
+
+        try:
+            return tuple(cluster.guards.get(i.dim) for i in ispace)
+        except AttributeError:
+            # `cluster` is actually a ClusterGroup
+            assert len(cluster.guards) == 1
+            return tuple(cluster.guards[0].get(i.dim) for i in ispace)
+
     def _make_key_hook(self, cluster, level):
         return ()
 
diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py
index d04962a52c..e9076bfa85 100644
--- a/devito/passes/clusters/blocking.py
+++ b/devito/passes/clusters/blocking.py
@@ -316,8 +316,6 @@ def callback(self, clusters, prefix):
 
 class SynthesizeBlocking(Queue):
 
-    _q_guards_in_key = True
-
     template = "%s%d_blk%s"
 
     def __init__(self, sregistry, options):
@@ -339,6 +337,13 @@ def process(self, clusters):
 
         return self._process_fdta(clusters, 1, blk_size_gen=blk_size_gen)
 
+    def _make_key_guards(self, cluster, ispace):
+        # Essentially, this will ensure that loop nests within guards *not*
+        # pertaining to the blockable Dimension are given a dedicated set of
+        # BlockDimensions, e.g. see `test_dimension.py::test_no_fusion_convoluted`
+        return tuple(cluster.guards.get(i.dim) for i in ispace
+                     if not cluster.properties.is_blockable(i.dim))
+
     def _derive_block_dims(self, clusters, prefix, d, blk_size_gen):
         if blk_size_gen is not None:
             step = sympify(blk_size_gen.next(prefix, d, clusters))
@@ -397,7 +402,9 @@ def callback(self, clusters, prefix, blk_size_gen=None):
                 # Use the innermost BlockDimension in place of `d`
                 subs = {d: bd}
                 exprs = [uxreplace(e, subs) for e in c.exprs]
-                guards = {subs.get(i, i): v for i, v in c.guards.items()}
+                guards = {
+                    subs.get(i, i): uxreplace(v, subs) for i, v in c.guards.items()
+                }
 
                 # The new Cluster properties -- TILABLE is dropped after blocking
                 properties = c.properties.drop(d)
diff --git a/tests/test_dimension.py b/tests/test_dimension.py
index 713378fb4f..665a31af77 100644
--- a/tests/test_dimension.py
+++ b/tests/test_dimension.py
@@ -2058,6 +2058,27 @@ def test_factor_and_condition(self):
         for t in range(buffer_size):
             assert np.all(usaved.data[t] == t*factor + bounds[0] - 1)
 
+    def test_blocking_w_guard(self):
+        grid = Grid(shape=(8, 8, 8))
+        x, y, z = grid.dimensions
+
+        u = TimeFunction(name="u", grid=grid, space_order=2)
+        v = u.func(name="v")
+
+        cdim = ConditionalDimension(name="cdim", parent=y, condition=Ge(y, 4))
+
+        eqns = [Eq(u.forward, u.dx + v.dy + 1),
+                Eq(v.forward, v.dx + u.dy + 1, implicit_dims=cdim)]
+
+        op = Operator(eqns, opt=('advanced', {'openmp': True}))
+
+        op.cfunction
+
+        assert_structure(op, ['t,x0_blk0,y0_blk0,x,y,z',
+                              't,x0_blk0,y0_blk0,x,y',
+                              't,x0_blk0,y0_blk0,x,y,z'],
+                         't,x0_blk0,y0_blk0,x,y,z,z')
+
 
 class TestCustomDimension:
 

From ca96cdf58bb775a141e9010d1d4bd46593c445ce Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 1 Jun 2026 08:44:32 +0100
Subject: [PATCH 4/8] tests: Remove useless sympy.Float cast

---
 tests/test_symbolics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_symbolics.py b/tests/test_symbolics.py
index aab8502a20..5529870e5e 100644
--- a/tests/test_symbolics.py
+++ b/tests/test_symbolics.py
@@ -579,7 +579,7 @@ def test_component_access_symbol_printing():
     acc = dSymbol(name='acc', dtype=dtypes_vector_mapper[(np.float32, 4)])
     expr = ComponentAccess(acc, 0)
 
-    assert ccode(sympy.Float('1.25')*expr, dtype=expr.dtype) == '1.250F*acc.x'
+    assert ccode(1.25*expr, dtype=expr.dtype) == '1.250F*acc.x'
 
 
 def test_vector_access():

From 3de03cafbd4040262c5d37d0048286f707fbceb1 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Wed, 3 Jun 2026 11:38:35 +0100
Subject: [PATCH 5/8] compiler: Fix DDA involving TensorMoves

---
 devito/ir/support/basic.py         | 30 +++++++++++++++++++--------
 devito/symbolics/extended_sympy.py | 11 +++++++++-
 devito/types/parallel.py           | 33 +++++++++++++++++++++++++-----
 3 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py
index 7939ee8fe8..6d91981e0a 100644
--- a/devito/ir/support/basic.py
+++ b/devito/ir/support/basic.py
@@ -15,11 +15,11 @@
 )
 from devito.tools import (
     CacheInstances, Tag, as_mapper, as_tuple, filter_sorted, flatten, is_integer,
-    memoized_generator, memoized_meth, smart_gt, smart_lt
+    memoized_generator, memoized_meth, smart_gt, smart_lt, split
 )
 from devito.types import (
     ComponentAccess, CriticalRegion, Dimension, DimensionTuple, Fence, Function, Symbol,
-    TBArray, Temp, TempArray
+    TBArray, Temp, TempArray, TensorMove
 )
 
 __all__ = ['ExprGeometry', 'IterationInstance', 'Scope', 'TimedAccess']
@@ -1383,19 +1383,31 @@ def vinf(entries):
 
 def retrieve_accesses(exprs, **kwargs):
     """
-    Like retrieve_terminals, but ensure that if a ComponentAccess is found,
-    the ComponentAccess itself is returned, while the wrapped Indexed is discarded.
+    Similar to `retrieve_terminals`, but with some adjustments:
+
+      * ComponentAccess's are retained, but the wrapped Indexed are discarded;
+      * TensorMove's are upcasted to the logical Indexed they represent.
     """
     kwargs['mode'] = 'unique'
 
     compaccs = search(exprs, ComponentAccess)
-    if not compaccs:
-        return retrieve_terminals(exprs, **kwargs)
 
-    subs = {i: Symbol(f'dummy{n}') for n, i in enumerate(compaccs)}
-    exprs1 = uxreplace(exprs, subs)
+    if compaccs:
+        # Handle ComponentAccesses
+        subs = {i: Symbol(f'dummy{n}') for n, i in enumerate(compaccs)}
+        exprs1 = uxreplace(exprs, subs)
+        terms1 = retrieve_terminals(exprs1, **kwargs)
+
+        accesses = compaccs | terms1 - set(subs.values())
+    else:
+        accesses = retrieve_terminals(exprs, **kwargs)
+
+    # Handle TensorMoves
+    key = lambda i: isinstance(i, TensorMove)
+    tmovs, other = split(accesses, key)
+    accesses = {i.access for i in tmovs} | other
 
-    return compaccs | retrieve_terminals(exprs1, **kwargs) - set(subs.values())
+    return accesses
 
 
 def disjoint_test(e0, e1, d, it):
diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py
index 86979c8bbd..7e466cf352 100644
--- a/devito/symbolics/extended_sympy.py
+++ b/devito/symbolics/extended_sympy.py
@@ -923,7 +923,12 @@ def _expected_alignment_elems(self):
 
     @property
     def base(self):
-        return self.args[0]
+        """
+        The IndexedBase of the aligned access.
+
+        To be implemented by subclasses.
+        """
+        raise NotImplementedError
 
     func = Reserved._rebuild
 
@@ -965,6 +970,10 @@ def _expected_alignment(self):
                 f"Unsupported dtype `{self.function.dtype}` for VectorAccess"
             )
 
+    @property
+    def base(self):
+        return self.args[0]
+
     @property
     def indices(self):
         return self.base.indices
diff --git a/devito/types/parallel.py b/devito/types/parallel.py
index 09498df986..b92978de1f 100644
--- a/devito/types/parallel.py
+++ b/devito/types/parallel.py
@@ -408,25 +408,40 @@ class TensorMove(AlignedAccess, Terminal):
     Represent the LOAD/STORE of a multi-dimensional block of data from/to a higher
     level of the memory hierarchy.
 
+    TensorMoves can move blocks of data of arbitrary size without worrying about
+    potential out-of-bounds (OOB) accesses:
+
+        * if an OOB point is read, the retrieved tensor is filled in with Nans/0s;
+        * attempted writes of OOB points via tensors exceeding the legal data region
+          are automatically discarded.
+
+    Thus, TensorMoves do not require OOB protection with guards.
+
+    However, the coordinates must be greater or equal than 0.
+
     Parameters
     ----------
-    base : IndexedBase
-        The base of the AbstractFunction subject of the TensorMove.
+    access : Indexed
+        The logical Indexed the TensorMove represents.
     tid0 : Dimension
         A representation of thread(s) issuing the TensorMove.
     coords : tuple
         The base address of the TensorMove (one point per Dimension).
     """
 
-    __rargs__ = ('base', 'tid0', 'coords')
+    __rargs__ = ('access', 'tid0', 'coords')
 
     _expected_alignment = 16
     """
     The expected alignment in bytes for the underlying LOAD/STORE operation.
     """
 
-    def __new__(cls, base, tid0, coords, **kwargs):
-        return super().__new__(cls, base, tid0, coords)
+    def __new__(cls, access, tid0, coords, **kwargs):
+        return super().__new__(cls, access, tid0, coords)
+
+    @property
+    def access(self):
+        return self.args[0]
 
     @property
     def tid0(self):
@@ -436,6 +451,14 @@ def tid0(self):
     def coords(self):
         return self.args[2]
 
+    @property
+    def function(self):
+        return self.access.function
+
+    @property
+    def base(self):
+        return self.function.base
+
     @cached_property
     def indexed(self):
         return self.function[self.coords]

From f67a3c1e8c32f866bb62344979874cbcdec01898 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Fri, 5 Jun 2026 17:43:59 +0100
Subject: [PATCH 6/8] compiler: Revamp to enable blocking before CIRE

---
 devito/core/cpu.py                 |   8 +-
 devito/core/gpu.py                 |  10 +-
 devito/core/operator.py            |   7 +
 devito/ir/clusters/cluster.py      |  37 +++-
 devito/ir/support/guards.py        |  25 +++
 devito/ir/support/properties.py    |  45 ++--
 devito/ir/support/space.py         |  34 ++-
 devito/ir/support/syncs.py         |   3 +
 devito/passes/clusters/aliases.py  | 324 ++++++++++++++++++-----------
 devito/passes/clusters/blocking.py | 153 +++++++-------
 devito/passes/iet/engine.py        |  16 +-
 devito/types/dimension.py          |  38 ++++
 tests/test_dle.py                  |  56 -----
 tests/test_dse.py                  |  22 ++
 14 files changed, 481 insertions(+), 297 deletions(-)

diff --git a/devito/core/cpu.py b/devito/core/cpu.py
index 3a55a4a4c5..79d1b409e7 100644
--- a/devito/core/cpu.py
+++ b/devito/core/cpu.py
@@ -5,8 +5,8 @@
 from devito.operator.operator import rcompile
 from devito.passes import stream_dimensions
 from devito.passes.clusters import (
-    Lift, blocking, buffering, cire, cse, factorize, fission, fuse, optimize_hyperplanes,
-    optimize_pows
+    Lift, apply_par_tiles, blocking, buffering, cire, cse, factorize, fission, fuse,
+    optimize_hyperplanes, optimize_pows
 )
 from devito.passes.equations import collect_derivatives
 from devito.passes.iet import (
@@ -67,6 +67,7 @@ def _normalize_kwargs(cls, **kwargs):
                                 reduce=oo.pop('par-tile-reduce', None))
 
         # CIRE
+        o['cire-block-temps'] = oo.pop('cire-block-temps', cls.CIRE_BLOCK_TEMPS)
         o['min-storage'] = oo.pop('min-storage', False)
         o['cire-rotate'] = oo.pop('cire-rotate', False)
         o['cire-maxpar'] = oo.pop('cire-maxpar', False)
@@ -198,6 +199,9 @@ def _specialize_clusters(cls, clusters, **kwargs):
         if options['blocklazy']:
             clusters = blocking(clusters, sregistry, options)
 
+        # Unfold the `par-tile`s, if any
+        clusters = apply_par_tiles(clusters, **kwargs)
+
         return clusters
 
     @classmethod
diff --git a/devito/core/gpu.py b/devito/core/gpu.py
index 06cb42eb37..f907d4be87 100644
--- a/devito/core/gpu.py
+++ b/devito/core/gpu.py
@@ -7,8 +7,8 @@
 from devito.operator.operator import rcompile
 from devito.passes import is_on_device, stream_dimensions
 from devito.passes.clusters import (
-    Lift, blocking, buffering, cire, cse, factorize, fission, fuse, memcpy_prefetch,
-    optimize_pows, tasking
+    Lift, apply_par_tiles, blocking, buffering, cire, cse, factorize, fission, fuse,
+    memcpy_prefetch, optimize_pows, tasking
 )
 from devito.passes.equations import collect_derivatives
 from devito.passes.iet import (
@@ -38,7 +38,9 @@
 
 class DeviceOperatorMixin:
 
+    # Overrides the default values in the main Operator class
     BLOCK_LEVELS = 0
+    CIRE_BLOCK_TEMPS = False
     MPI_MODES = (True, 'basic',)
 
     GPU_FIT = 'all-fallback'
@@ -76,6 +78,7 @@ def _normalize_kwargs(cls, **kwargs):
         o['skewing'] = oo.pop('skewing', False)
 
         # CIRE
+        o['cire-block-temps'] = oo.pop('cire-block-temps', cls.CIRE_BLOCK_TEMPS)
         o['min-storage'] = False
         o['cire-rotate'] = False
         o['cire-maxpar'] = oo.pop('cire-maxpar', 'basic')
@@ -239,6 +242,9 @@ def _specialize_clusters(cls, clusters, **kwargs):
         if options['blocklazy']:
             clusters = blocking(clusters, sregistry, options)
 
+        # Unfold the `par-tile`s, if any
+        clusters = apply_par_tiles(clusters, **kwargs)
+
         return clusters
 
     @classmethod
diff --git a/devito/core/operator.py b/devito/core/operator.py
index 16c403ba79..2494dbd7cf 100644
--- a/devito/core/operator.py
+++ b/devito/core/operator.py
@@ -58,6 +58,13 @@ class BasicOperator(Operator):
     situations where the performance impact might be detrimental.
     """
 
+    CIRE_BLOCK_TEMPS = True
+    """
+    If an aliasing expression is computed within a blocked loop nest, all CIRE-
+    generated temporaries will inherit the block shape. If set to False, the
+    temporaries shape will systematically be defined by the root Dimensions.
+    """
+
     CIRE_MINGAIN = 10
     """
     Minimum operation count reduction for a redundant expression to be optimized
diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py
index cbf206b3ff..f40855121e 100644
--- a/devito/ir/clusters/cluster.py
+++ b/devito/ir/clusters/cluster.py
@@ -13,7 +13,7 @@
 )
 from devito.mpi.halo_scheme import HaloScheme, HaloTouch
 from devito.mpi.reduction_scheme import DistReduce
-from devito.symbolics import estimate_cost
+from devito.symbolics import estimate_cost, uxreplace
 from devito.tools import as_tuple, filter_ordered, flatten, infer_dtype
 from devito.types import (
     CriticalRegion, Fence, Indexed, PhaseMarker, TensorMove, ThreadArrive, ThreadCommit,
@@ -128,6 +128,33 @@ def rebuild(self, *args, **kwargs):
                               syncs=kwargs.get('syncs', self.syncs),
                               halo_scheme=kwargs.get('halo_scheme', self.halo_scheme))
 
+    def subs(self, mapper, compact=()):
+        """
+        Build a new Cluster applying substitutions rules to `self`.
+        """
+        if not mapper:
+            return self
+
+        if self.halo_scheme:
+            raise NotImplementedError
+
+        key0 = lambda i: i.is_Block
+        subs0 = {d: self.ispace[d].promote(key0).dim for d in compact}
+
+        subs = {**mapper, **subs0}
+        exprs = [uxreplace(e, subs) for e in self.exprs]
+
+        ispace = self.ispace.switch(mapper)
+        key = lambda i: key0(i) and i in flatten(d._defines for d in subs0)
+        ispace = ispace.promote(key, mode='total')
+
+        guards = self.guards.subs(mapper).promote(subs0)
+        properties = self.properties.subs(mapper).promote(subs0)
+        syncs = self.syncs.subs(mapper)
+
+        return self.__class__(exprs=exprs, ispace=ispace, guards=guards,
+                              properties=properties, syncs=syncs)
+
     @property
     def exprs(self):
         return self._exprs
@@ -591,6 +618,14 @@ def dspace(self):
         """Return the DataSpace of this ClusterGroup."""
         return DataSpace.union(*[i.dspace.reset() for i in self])
 
+    @property
+    def is_dense(self):
+        return all(i.is_dense for i in self)
+
+    @property
+    def is_wild(self):
+        return all(i.is_wild for i in self)
+
     @property
     def is_halo_touch(self):
         return all(i.is_halo_touch for i in self)
diff --git a/devito/ir/support/guards.py b/devito/ir/support/guards.py
index b8a335b1f4..ac808eeb89 100644
--- a/devito/ir/support/guards.py
+++ b/devito/ir/support/guards.py
@@ -342,6 +342,31 @@ def as_map(self, d, cls):
 
         return dict(i.args for i in search(self.get(d), cls))
 
+    def subs(self, mapper):
+        m = {mapper.get(d, d): v.xreplace(mapper) for d, v in self.items()}
+
+        return Guards(m)
+
+    def promote(self, subs):
+        m = self
+        for d, v in subs.items():
+            guards = {self.get(i) for i in d._defines} - {true}
+            if len(guards) > 1:
+                raise NotImplementedError(
+                    f"Cannot promote {d} to {v} due to multiple guards: {guards}"
+                )
+            elif len(guards) == 0:
+                continue
+
+            guard = guards.pop()
+            guard = guard.xreplace({d: v})
+
+            m = m.impose(v, guard)
+
+        m = m.popany(subs)
+
+        return m
+
 
 class GuardExpr(LocalObject, BooleanFunction):
 
diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py
index 9e787a8b9e..b0033b2485 100644
--- a/devito/ir/support/properties.py
+++ b/devito/ir/support/properties.py
@@ -39,10 +39,11 @@ def __init__(self, name, val=None):
 TILABLE = Property('tilable')
 """A fully parallel Dimension that would benefit from tiling (or "blocking")."""
 
-TILABLE_SMALL = Property('tilable*')
+NO_TUNING = Property('notuning')
 """
-Like TILABLE, but it would benefit from relatively small block, since the
-iteration space is likely to be very small.
+A Dimension that would be unlikely to benefit from tuning. For example, the
+underlying iteration space is relatively small, and/or the enclosed expressions
+are not characterized by data reuse, etc.
 """
 
 SKEWABLE = Property('skewable')
@@ -115,7 +116,6 @@ def __init__(self, name, val=None):
 
 # Bundles
 PARALLELS = {PARALLEL, PARALLEL_INDEP, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT}
-TILABLES = {TILABLE, TILABLE_SMALL}
 
 
 def normalize_properties(*args):
@@ -253,16 +253,10 @@ def prefetchable(self, dims, v=PREFETCHABLE):
             m[d] = self.get(d, set()) | {v}
         return Properties(m)
 
-    def block(self, dims, kind='default'):
-        if kind == 'default':
-            p = TILABLE
-        elif kind == 'small':
-            p = TILABLE_SMALL
-        else:
-            raise ValueError
+    def block(self, dims):
         m = dict(self)
         for d in as_tuple(dims):
-            m[d] = set(self.get(d, [])) | {p}
+            m[d] = set(self.get(d, [])) | {TILABLE}
         return Properties(m)
 
     def inbound(self, dims):
@@ -289,6 +283,9 @@ def init_halo_right_shm(self, dims):
                                                  INIT_HALO_LEFT_SHM})
         return properties
 
+    def notune(self, dims):
+        return self.add(dims, NO_TUNING)
+
     def is_parallel(self, dims):
         return any(len(self[d] & {PARALLEL, PARALLEL_INDEP}) > 0
                    for d in as_tuple(dims))
@@ -310,10 +307,7 @@ def is_sequential(self, dims):
         return any(SEQUENTIAL in self.get(d, ()) for d in as_tuple(dims))
 
     def is_blockable(self, d):
-        return bool(self.get(d, set()) & {TILABLE, TILABLE_SMALL})
-
-    def is_blockable_small(self, d):
-        return TILABLE_SMALL in self.get(d, set())
+        return bool(TILABLE in self.get(d, ()))
 
     def _is_property_any(self, dims, v):
         if dims is None:
@@ -335,6 +329,25 @@ def is_halo_right_init(self, dims=None):
     def is_halo_init(self, dims=None):
         return self.is_halo_left_init(dims) or self.is_halo_right_init(dims)
 
+    def avoid_tuning(self, dims):
+        return any(NO_TUNING in self.get(d, set()) for d in as_tuple(dims))
+
+    def subs(self, mapper):
+        return Properties({mapper.get(d, d): v for d, v in self.items()})
+
+    def promote(self, subs):
+        m = self
+        for d, pd in subs.items():
+            if pd not in d._defines:
+                raise ValueError(f"Cannot promote {d} to {pd} as {pd} does not "
+                                 f"belong to {d}'s hierarchy")
+
+            v = normalize_properties(*[self.get(i, set()) for i in d._defines])
+
+            m = self.drop(d._defines).add(pd, v)
+
+        return m
+
     @property
     def nblockable(self):
         return sum([self.is_blockable(d) for d in self])
diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py
index 1c760128b5..96f3e3be07 100644
--- a/devito/ir/support/space.py
+++ b/devito/ir/support/space.py
@@ -478,7 +478,12 @@ def relaxed(self, d=None):
 
     def promote(self, cond, mode='unordered'):
         intervals = [i.promote(cond) for i in self]
-        intervals = IntervalGroup(intervals, relations=None, mode=mode)
+
+        if mode == 'total':
+            relations = [filter_ordered(i.dim for i in intervals)]
+        else:
+            relations = None
+        intervals = IntervalGroup(intervals, relations=relations, mode=mode)
 
         # There could be duplicate Dimensions at this point, so we sum up the
         # Intervals defined over the same Dimension to produce a well-defined
@@ -899,16 +904,29 @@ def augment(self, sub_iterators):
 
         return IterationSpace(self.intervals, items, self.directions)
 
-    def switch(self, d0, d1, direction=None):
-        intervals = self.intervals.switch(d0, d1)
+    def switch(self, d0, d1=None, direction=None):
+        """
+        Construct a new IterationSpace in which the Dimension `d0` is replaced with
+        `d1`. Optionally, `d0` could be a mapper, in which case multiple Dimensions
+        may be switched.
+        """
+        if isinstance(d0, dict):
+            mapper = d0
+        else:
+            mapper = {d0: d1}
 
+        intervals = self.intervals
         sub_iterators = dict(self.sub_iterators)
-        sub_iterators.pop(d0, None)
-        sub_iterators[d1] = ()
-
         directions = dict(self.directions)
-        v = directions.pop(d0, None)
-        directions[d1] = direction or v
+
+        for d0, d1 in mapper.items():
+            intervals = intervals.switch(d0, d1)
+
+            sub_iterators.pop(d0, None)
+            sub_iterators[d1] = ()
+
+            v = directions.pop(d0, None)
+            directions[d1] = direction or v
 
         return IterationSpace(intervals, sub_iterators, directions)
 
diff --git a/devito/ir/support/syncs.py b/devito/ir/support/syncs.py
index 67623e5fe0..d7f4f86bfe 100644
--- a/devito/ir/support/syncs.py
+++ b/devito/ir/support/syncs.py
@@ -175,6 +175,9 @@ def update(self, ops):
             m[d] = set(self.get(d, [])) | set(v)
         return Ops(m)
 
+    def subs(self, mapper):
+        return Ops({mapper.get(d, d): v for d, v in self.items()})
+
     def _get_sync(self, cls, dims=None):
         if dims is None:
             dims = list(self)
diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py
index 20647c2084..0b81e5db9b 100644
--- a/devito/passes/clusters/aliases.py
+++ b/devito/passes/clusters/aliases.py
@@ -5,6 +5,7 @@
 import numpy as np
 import sympy
 
+from devito.arch import Device
 from devito.exceptions import CompilationError
 from devito.finite_differences import EvalDerivative, IndexDerivative, Weights
 from devito.ir import (
@@ -18,12 +19,12 @@
     uxreplace
 )
 from devito.tools import (
-    Stamp, as_mapper, as_tuple, flatten, frozendict, generator, is_integer, split,
-    timed_pass
+    Reconstructable, Stamp, as_mapper, as_tuple, flatten, frozendict, generator,
+    is_integer, split, timed_pass
 )
 from devito.types import (
-    CustomDimension, Eq, Hyperplane, IncrDimension, Indexed, ModuloDimension, Size,
-    StencilDimension, Symbol, Temp, TempArray, TempFunction
+    BlockDimension, CustomDimension, Eq, Hyperplane, IncrDimension, Indexed,
+    ModuloDimension, Size, StencilDimension, Symbol, Temp, TempArray, TempFunction
 )
 from devito.types.grid import MultiSubDimension
 
@@ -112,6 +113,7 @@ def __init__(self, sregistry, options, platform):
         self.sregistry = sregistry
         self.platform = platform
 
+        self.opt_block_temps = options['cire-block-temps']
         self.opt_minstorage = options['min-storage']
         self.opt_rotate = options['cire-rotate']
         self.opt_ftemps = options['cire-ftemps']
@@ -129,7 +131,9 @@ def _aliases_from_clusters(self, cgroup, exclude, meta):
             exprs, aliases = self._choose(found, cgroup, mapper)
 
             # AliasList -> Schedule
-            schedule = lower_aliases(aliases, meta, self.opt_maxpar)
+            schedule = lower_aliases(
+                aliases, meta, self.opt_maxpar, self.opt_block_temps
+            )
 
             variants.append(make_variant(schedule, exprs, mapper))
 
@@ -158,7 +162,8 @@ def _aliases_from_clusters(self, cgroup, exclude, meta):
         if self.opt_multisubdomain:
             processed = optimize_clusters_msds(processed)
 
-        # [Clusters]_k -> [Clusters]_{k+n}
+        # [Clusters]_m -> [Clusters]_{m}
+        rebuilt = []
         for c in cgroup:
             n = len(c.exprs)
             cexprs, exprs = exprs[:n], exprs[n:]
@@ -168,10 +173,16 @@ def _aliases_from_clusters(self, cgroup, exclude, meta):
             ispace = c.ispace.augment(schedule.dmapper)
             ispace = ispace.augment(schedule.rmapper)
 
-            processed.append(c.rebuild(exprs=cexprs, ispace=ispace))
+            rebuilt.append(c.rebuild(exprs=cexprs, ispace=ispace))
 
         assert len(exprs) == 0
 
+        # [Clusters]_m -> [Clusters]_m (optimization)
+        if not self.opt_block_temps:
+            rebuilt = expose_tuning_knobs(rebuilt, self.sregistry)
+
+        processed.extend(rebuilt)
+
         return processed
 
     def process(self, clusters):
@@ -384,10 +395,6 @@ def __init__(self, sregistry, options, platform):
     def process(self, clusters):
         processed = []
         for c in clusters:
-            if not c.is_dense:
-                processed.append(c)
-                continue
-
             # Rule out Dimension-independent dependencies, e.g.:
             # r0 = ...
             # u[x, y] = ... r0*a[x, y] ...
@@ -405,6 +412,9 @@ def process(self, clusters):
         return processed
 
     def _generate(self, cgroup, exclude):
+        if not cgroup.is_dense:
+            return
+
         exprs = cgroup.exprs
 
         # E.g., extract `u.dx*a*b` and `u.dx*a*c` from
@@ -665,97 +675,91 @@ def collect(extracted, meta, minstorage):
     return aliases
 
 
-def lower_aliases(aliases, meta, maxpar):
+def lower_aliases(aliases, meta, opt_maxpar, opt_block_temps):
     """
     Create a Schedule from an AliasList.
     """
-    dmapper = {}
+    dmapper = make_blockdim_subiter(meta.ispace.itdims)
+
     processed = []
     for a in aliases:
-        imapper = {**{i.dim: i for i in a.intervals},
-                   **{i.dim.parent: i for i in a.intervals if i.dim.is_NonlinearDerived}}
+        imapper = {
+            **{i.dim: i for i in a.intervals},
+            **{i.dim.parent: i for i in a.intervals if i.dim.is_NonlinearDerived}
+        }
 
-        intervals = []
         writeto = []
+        intervals = {}
         sub_iterators = {}
         indicess = [[] for _ in a.distances]
         for i in meta.ispace:
+            d = i.dim
+
             try:
-                interval = imapper[i.dim]
+                interval = imapper[d]
             except KeyError:
-                if i.dim in a.free_symbols:
+                if d in a.free_symbols:
                     # Special case: the Dimension appears within the alias but
                     # not as an Indexed index. Then, it needs to be added to
                     # the `writeto` region too
                     interval = i
                 else:
                     # E.g., `x0_blk0` or (`a[y_m+1]` => `y not in imapper`)
-                    intervals.append(i)
+                    intervals[d] = i
                     continue
 
             if not (writeto or
                     interval != interval.zero() or
-                    (maxpar and SEQUENTIAL not in meta.properties.get(i.dim))):
-                # The alias doesn't require a temporary Dimension along i.dim
-                intervals.append(i)
+                    (opt_maxpar and SEQUENTIAL not in meta.properties.get(d))):
+                # The alias doesn't require a temporary Dimension along `d`
+                intervals[d] = i
                 continue
 
-            assert not i.dim.is_NonlinearDerived
+            assert not d.is_NonlinearDerived
 
-            # `i.dim` is necessarily part of the write-to region, so
-            # we have to adjust the Interval's stamp. For example, consider
-            # `i=x[0,0]<1>` and `interval=x[-4,4]<0>`; here we need to
-            # use `<1>` as stamp, which is what appears in `ispace`
+            # `d` is necessarily part of the write-to region, so we have to
+            # adjust the Interval's stamp. For example, consider `i=x[0,0]<1>`
+            # and `interval=x[-4,4]<0>`; here we need to use `<1>` as stamp,
+            # which is what appears in `ispace`
             interval = interval.lift(i.stamp)
 
-            writeto.append(interval)
-            intervals.append(interval)
-
-            if i.dim.is_Block:
-                # Suitable IncrDimensions must be used to avoid OOB accesses.
-                # E.g., r[xs][ys][z] => both `xs` and `ys` must be initialized such
-                # that all accesses are within bounds. This requires traversing the
-                # hierarchy of BlockDimensions to set `xs` (`ys`) in a way that
-                # consecutive blocks access consecutive regions in `r` (e.g.,
-                # `xs=x0_blk1-x0_blk0` with `blocklevels=2`; `xs=0` with
-                # `blocklevels=1`, that is it degenerates in this case)
-                try:
-                    d = dmapper[i.dim]
-                except KeyError:
-                    dd = i.dim.parent
-                    assert dd.is_Block
-                    if dd.parent.is_Block:
-                        # A BlockDimension in between BlockDimensions
-                        m = i.dim.symbolic_min - i.dim.parent.symbolic_min
-                    else:
-                        m = 0
-                    d = dmapper[i.dim] = IncrDimension(
-                        f"{i.dim.name}s", i.dim, m, dd.symbolic_size, 1, dd.step
-                    )
-                sub_iterators[i.dim] = d
+            if opt_block_temps:
+                sub_iterators.update(dmapper)
+                writeto.append(interval)
+                intervals[d] = interval
+            elif d.is_Block:
+                pd = d.parent
+                writeto.append(interval.relaxed)
+                # The lower/upper bounds belong to the parent BlockDimension
+                intervals[d] = interval.zero()
+                intervals[pd] = interval.switch(pd)
             else:
-                d = i.dim
+                writeto.append(interval.relaxed)
+                intervals[d] = interval
+
+            di = sub_iterators.get(d, d)
 
             # Given the iteration `interval`, lower distances to indices
             for distance, indices in zip(a.distances, indicess, strict=True):
-                v = distance[interval.dim] or 0
+                v = distance[i.dim] or 0
                 try:
-                    indices.append(d - interval.lower + v)
+                    indices.append(di - interval.lower + v)
                 except TypeError:
-                    indices.append(d)
+                    indices.append(di)
 
         # The alias write-to space
         writeto = IterationSpace(IntervalGroup(writeto), sub_iterators)
 
         # Avoid scalar aliases in the presence of guards, since hoisting them
-        # might cause scope issues (see `test_dse.py::TestAliases::test_split_cond`)
+        # might cause issues (see `test_dse.py::TestAliases::test_split_cond`)
         if not writeto and meta.guards:
             continue
 
         # The alias iteration space
-        ispace = IterationSpace(IntervalGroup(intervals, meta.ispace.relations),
-                                meta.ispace.sub_iterators,
-                                meta.ispace.directions)
+        ispace = IterationSpace(
+            IntervalGroup(intervals.values(), meta.ispace.relations),
+            meta.ispace.sub_iterators, meta.ispace.directions
+        )
         ispace = ispace.augment(sub_iterators)
 
         # For now, the guards coincide with the original ones, if any
@@ -878,8 +882,8 @@ def optimize_schedule_rotations(schedule, sregistry):
             aispace = aispace.augment({d: mds + [ii]})
             ispace = IterationSpace.union(rispace, aispace, relations={(d, cd, d1)})
 
-            processed.append(ScheduledAlias(
-                pivot, writeto, ispace, i.aliaseds, indicess, i.guards
+            processed.append(i._rebuild(
+                pivot=pivot, writeto=writeto, ispace=ispace, indicess=indicess
             ))
 
         # Update the rotations mapper
@@ -906,10 +910,7 @@ def optimize_schedule_maxpar_basic(schedule):
         writeto = writeto0.lift(dims, stamp)
         ispace = ispace0.lift(dims, stamp)
 
-        processed.extend([
-            ScheduledAlias(pivot, writeto, ispace, aliaseds, indicess, guards)
-            for pivot, _, _, aliaseds, indicess, guards in g
-        ])
+        processed.extend(i._rebuild(writeto=writeto, ispace=ispace) for i in g)
 
     return schedule.rebuild(*processed)
 
@@ -927,31 +928,34 @@ def optimize_schedule_maxpar_compact(schedule):
     ispace_union = IterationSpace.union(*[i.ispace for i in schedule])
 
     processed = []
-    for pivot, writeto, ispace, aliaseds, indicess, guards in schedule:
-        dims = writeto.itdims
+    for sa in schedule:
+        guards = sa.guards
 
         # Do we need to introduce guards to preserve the original semantics?
-        for d in dims:
-            int0 = ispace[d]
-            int1 = ispace_union[d]
+        for i in sa.indices:
+            for d in i._defines:
+                try:
+                    int0, int1 = sa.ispace[d], ispace_union[d]
+                except KeyError:
+                    continue
 
-            if int0 != int1:
-                guard = sympy.And(d >= int0.symbolic_min, d <= int0.symbolic_max)
-                guards = guards.xandg(d, guard)
+                if int0 != int1:
+                    guards = guards.xandg(i, sympy.And(
+                        i >= int0.symbolic_min, i <= int0.symbolic_max
+                    ))
+                    break
 
         # This is conceptually identical to what we do for the `basic` mode
-        writeto = writeto.lift(dims, stamp)
+        dims = sa.writeto.itdims
+        writeto = sa.writeto.lift(dims, stamp)
         ispace = ispace_union.lift(dims, stamp)
 
-        processed.append(ScheduledAlias(
-            pivot, writeto, ispace, aliaseds, indicess, guards
-        ))
+        processed.append(sa._rebuild(writeto=writeto, ispace=ispace, guards=guards))
 
     return schedule.rebuild(*processed)
 
 
-def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype,
-                   opt_minmem):
+def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype, opt_minmem):
     """
     Turn a Schedule into a sequence of Clusters.
     """
@@ -960,28 +964,12 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype,
 
     clusters = []
     subs = {}
-    for pivot, writeto, ispace, aliaseds, indicess, guards in schedule:
+    for sa in schedule:
+        pivot, writeto, ispace, aliaseds, indicess, guards = sa
+
         name = sregistry.make_name()
-        # Infer the dtype for the pivot
-        # This prevents cases such as `floor(a*b)` with `a` and `b` floats
-        # that would creat a temporary `int r = b` leading to erroneous
-        # numerical results
 
         if writeto:
-            # The Dimensions defining the shape of Array
-            # Note: with SubDimensions, we may have the following situation:
-            #
-            # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
-            #   r[zi] = ...
-            #
-            # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll
-            # need as much room as in `zi`'s parent to avoid going OOB Aside
-            # from ugly generated code, the reason we do not rather shift the
-            # indices is that it prevents future passes to transform the loop
-            # bounds (e.g., MPI's comp/comm overlap does that)
-            dimensions = [d.parent if d.is_AbstractSub else d
-                          for d in writeto.itdims]
-
             # The minimum halo required along each Dimension depends on `writeto`.
             # The user might suggest to go more relaxed about this via `opt_minmem`,
             # in which case we extend the halo based on the surrounding
@@ -1002,17 +990,20 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype,
             shift = [halo[d].left - min_halo[d].left for d in writeto.itdims]
             halo = tuple(halo.values())
 
-            # The indices used to write into the Array
+            # The dimensions of the Array, which together with `halo`, will define
+            # its shape
+            dimensions = sa.dimensions
+
+            # The indices used to populate the Array
             indices = []
-            for i, s in zip(writeto, shift, strict=True):
+            for interval, i, s in zip(writeto, sa.indices, shift, strict=True):
                 try:
                     # E.g., `xs`
-                    sub_iterators = writeto.sub_iterators[i.dim]
-                    assert len(sub_iterators) <= 1
-                    indices.append(sub_iterators[0] + s)
-                except (KeyError, IndexError):
+                    di, = writeto.sub_iterators[i]
+                    indices.append(di + s)
+                except (KeyError, ValueError):
                     # E.g., `z` -- a non-shifted Dimension
-                    indices.append(i.dim - i.lower + s)
+                    indices.append(i - interval.lower + s)
 
             dtype = sympy_dtype(pivot, base=meta.dtype)
             obj = make(name=name, dimensions=dimensions, halo=halo, dtype=dtype,
@@ -1070,21 +1061,34 @@ def optimize_clusters_msds(clusters):
     processed = []
     for c in clusters:
         msds = [d for d in c.ispace.itdims if isinstance(d, MultiSubDimension)]
+        if not msds:
+            processed.append(c)
+            continue
 
-        if msds:
-            mapper = {d: d.root for d in msds}
-            exprs = [uxreplace(e, mapper) for e in c.exprs]
+        mapper = {d: d.root for d in msds}
 
-            ispace = c.ispace.relaxed(msds)
+        processed.append(c.subs(mapper))
 
-            guards = {mapper.get(d, d): v for d, v in c.guards.items()}
-            properties = {mapper.get(d, d): v for d, v in c.properties.items()}
-            syncs = {mapper.get(d, d): v for d, v in c.syncs.items()}
+    return processed
 
-            processed.append(c.rebuild(exprs=exprs, ispace=ispace, guards=guards,
-                                       properties=properties, syncs=syncs))
-        else:
-            processed.append(c)
+
+def expose_tuning_knobs(clusters, sregistry):
+    """
+    Replace all pre-existing BlockDimensions with fresh ones, to enable
+    separate tuning for the CIRE-generated temporaries.
+    """
+    # Create the new BlockDimensions
+    callback = lambda i: sregistry.make_name(prefix=i)
+
+    mapper = {}
+    for d in set().union(*[c.used_dimensions for c in clusters]):
+        if d.is_Block:
+            mapper.update(d._rebuild_hierarchy(callback))
+
+    if not mapper:
+        return clusters
+
+    processed = [c.subs(mapper) for c in clusters]
 
     return processed
 
@@ -1474,18 +1478,64 @@ def is_frame(self):
         return all(i.is_frame for i in self._list)
 
 
-ScheduledAlias = namedtuple('SchedAlias',
-                            'pivot writeto ispace aliaseds indicess guards')
-ScheduledAlias.__new__.__defaults__ = (None, None, None, None, None, None)
+class ScheduledAlias(tuple, Reconstructable):
+
+    __rkwargs__ = ('pivot', 'writeto', 'ispace', 'aliaseds', 'indicess', 'guards')
+
+    def __new__(cls, pivot=None, writeto=None, ispace=None, aliaseds=None,
+                indicess=None, guards=None):
+        obj = super().__new__(cls, (pivot, writeto, ispace, aliaseds, indicess,
+                                    guards))
+
+        assert len(aliaseds) > 0
+        assert len(aliaseds) == len(indicess)
+
+        obj.pivot = pivot
+        obj.writeto = writeto
+        obj.ispace = ispace
+        obj.aliaseds = aliaseds
+        obj.indicess = indicess
+        obj.guards = guards
+
+        return obj
+
+    def __repr__(self):
+        return f"ScheduledAlias<<{self.pivot}>>"
+
+    __str__ = __repr__
+
+    @property
+    def dimensions(self):
+        # Legacy: with SubDimensions, we may have the following situation:
+        #
+        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
+        #   r[zi] = ...
+        #
+        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll
+        # need as much room as in `zi`'s parent to avoid going out-of-bounds.
+        # Aside from ugly generated code, the reason we do not rather shift the
+        # indices is that it prevents future passes to transform the loop
+        # bounds (e.g., MPI's comp/comm overlap does that)
+        return tuple(d.parent if d.is_AbstractSub else d
+                     for d in self.writeto.itdims)
+
+    @property
+    def indices(self):
+        bdims = [d for d in self.ispace.itdims if d.is_Block]
+        depth = max([d._depth for d in bdims], default=0)
+        mapper = {d.root: d for d in bdims if d._depth == depth}
+        return tuple(mapper.get(d.root, d) for d in self.writeto.itdims)
 
 
 class Schedule(tuple):
 
     def __new__(cls, *items, dmapper=None, rmapper=None, is_frame=False):
         obj = super().__new__(cls, items)
+
         obj.dmapper = dmapper or {}
         obj.rmapper = rmapper or {}
         obj.is_frame = is_frame
+
         return obj
 
     def rebuild(self, *items, dmapper=None, rmapper=None, is_frame=False):
@@ -1634,3 +1684,37 @@ def _(expr):
 @_deindexify.register(Indexed)
 def _(expr):
     return expr.function, {}
+
+
+def make_blockdim_subiter(dimensions):
+    """
+    Create sub-iterators for the BlockDimension in `dimensions`.
+
+    For example, in `r[xs][ys][z]`  both `xs` and `ys` must be initialized such
+    that all accesses are within bounds. This requires traversing the hierarchy
+    of BlockDimensions to set `xs` (`ys`) in a way that consecutive blocks access
+    consecutive regions in `r` (e.g., trivially `xs=0` with `blocklevels=1`;
+    `xs=x0_blk1-x0_blk0` with `blocklevels=2`; and so on).
+    """
+    depth = max([d._depth for d in dimensions if d.is_Block], default=0)
+    if depth <= 1:
+        return {}
+
+    mapper = {}
+    for d in dimensions:
+        if not d.is_Block or d._depth < depth:
+            continue
+
+        pd = d.parent
+
+        if pd.parent.is_Block:
+            # A BlockDimension in between BlockDimensions
+            m = d.symbolic_min - pd.symbolic_min
+        else:
+            m = 0
+
+        mapper[d] = IncrDimension(
+            f"{d.name}s", d, m, pd.symbolic_size, 1, pd.step
+        )
+
+    return mapper
diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py
index e9076bfa85..af659193fc 100644
--- a/devito/passes/clusters/blocking.py
+++ b/devito/passes/clusters/blocking.py
@@ -1,9 +1,11 @@
+from itertools import groupby
+
 from sympy import sympify
 
 from devito.finite_differences.differentiable import IndexSum
 from devito.ir.clusters import Queue
 from devito.ir.support import (
-    AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT, SKEWABLE, TILABLES, Interval,
+    AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT, SKEWABLE, TILABLE, Interval,
     IntervalGroup, IterationSpace, Scope
 )
 from devito.passes import is_on_device
@@ -14,7 +16,7 @@
 )
 from devito.types import BlockDimension
 
-__all__ = ['blocking']
+__all__ = ['blocking', 'apply_par_tiles']
 
 
 def blocking(clusters, sregistry, options):
@@ -205,12 +207,11 @@ def callback(self, clusters, prefix):
                     # along more than three dimensions
                     return clusters
 
-                if any(self._has_short_trip_count(i) for i in c.ispace.itdims):
-                    properties = c.properties.block(d, 'small')
-                elif self._has_data_reuse(c):
-                    properties = c.properties.block(d)
-                else:
-                    properties = c.properties.block(d, 'small')
+                properties = c.properties.block(d)
+
+                if any(self._has_short_trip_count(i) for i in c.ispace.itdims) or \
+                   not self._has_data_reuse(c):
+                    properties = properties.notune(d)
 
             elif self._has_data_reuse(c):
                 properties = c.properties.block(d)
@@ -239,7 +240,7 @@ def process(self, clusters):
             if c.properties.nblockable > 1:
                 processed.append(c)
             else:
-                properties = c.properties.drop(properties=TILABLES)
+                properties = c.properties.drop(properties=TILABLE)
                 processed.append(c.rebuild(properties=properties))
 
         return processed
@@ -316,13 +317,13 @@ def callback(self, clusters, prefix):
 
 class SynthesizeBlocking(Queue):
 
+    #TODO: DROP?
     template = "%s%d_blk%s"
 
     def __init__(self, sregistry, options):
         self.sregistry = sregistry
 
         self.levels = options['blocklevels']
-        self.par_tile = options['par-tile']
 
         # Track the BlockDimensions created so far so that we can reuse them
         # in case of Clusters that are different but share the same number of
@@ -332,10 +333,7 @@ def __init__(self, sregistry, options):
         super().__init__()
 
     def process(self, clusters):
-        # A tool to unroll the explicit integer block shapes, should there be any
-        blk_size_gen = BlockSizeGenerator(self.par_tile) if self.par_tile else None
-
-        return self._process_fdta(clusters, 1, blk_size_gen=blk_size_gen)
+        return self._process_fdta(clusters, 1)
 
     def _make_key_guards(self, cluster, ispace):
         # Essentially, this will ensure that loop nests within guards *not*
@@ -344,25 +342,18 @@ def _make_key_guards(self, cluster, ispace):
         return tuple(cluster.guards.get(i.dim) for i in ispace
                      if not cluster.properties.is_blockable(i.dim))
 
-    def _derive_block_dims(self, clusters, prefix, d, blk_size_gen):
-        if blk_size_gen is not None:
-            step = sympify(blk_size_gen.next(prefix, d, clusters))
-        else:
-            # This will result in a parametric step, e.g. `x0_blk0_size`
-            step = None
-
+    def _derive_block_dims(self, clusters, prefix, d):
         # Can I reuse existing BlockDimensions to avoid a proliferation of steps?
         k = stencil_footprint(clusters, d)
-        if step is None:
-            try:
-                return self.mapper[k]
-            except KeyError:
-                pass
+        try:
+            return self.mapper[k]
+        except KeyError:
+            pass
 
         base = self.sregistry.make_name(prefix=d.root.name)
 
         name = self.sregistry.make_name(prefix=f"{base}_blk")
-        bd = BlockDimension(name, d, d.symbolic_min, d.symbolic_max, step)
+        bd = BlockDimension(name, d, d.symbolic_min, d.symbolic_max)
         step = bd.step
         block_dims = [bd]
 
@@ -378,7 +369,7 @@ def _derive_block_dims(self, clusters, prefix, d, blk_size_gen):
 
         return retval
 
-    def callback(self, clusters, prefix, blk_size_gen=None):
+    def callback(self, clusters, prefix):
         if not prefix:
             return clusters
 
@@ -388,9 +379,7 @@ def callback(self, clusters, prefix, blk_size_gen=None):
             return clusters
 
         try:
-            block_dims, bd = self._derive_block_dims(
-                clusters, prefix, d, blk_size_gen
-            )
+            block_dims, bd = self._derive_block_dims(clusters, prefix, d)
         except StopIteration:
             return clusters
 
@@ -408,7 +397,7 @@ def callback(self, clusters, prefix, blk_size_gen=None):
 
                 # The new Cluster properties -- TILABLE is dropped after blocking
                 properties = c.properties.drop(d)
-                properties = properties.add(block_dims, c.properties[d] - TILABLES)
+                properties = properties.add(block_dims, c.properties[d] - {TILABLE})
 
                 processed.append(c.rebuild(exprs=exprs, ispace=ispace,
                                            guards=guards, properties=properties))
@@ -508,7 +497,6 @@ class BlockSizeGenerator:
     """
 
     def __init__(self, par_tile):
-        self.tip = -1
         self.umt = par_tile
 
         if par_tile.is_multi:
@@ -537,56 +525,25 @@ def __init__(self, par_tile):
             else:
                 self.umt_reduce = UnboundTuple(*par_tile.default, 1)
 
-    def next(self, prefix, d, clusters):
-        # If a whole new set of Dimensions, move the tip -- this means `clusters`
-        # at `d` represents a new loop nest or kernel
-        x = any(i.dim.is_Block for i in flatten(c.ispace for c in clusters))
-        if not x:
-            self.tip += 1
-
-        # Correctness -- enforce blocking where necessary.
-        # See also issue #276:PRO
-        if any(c.properties.is_parallel_atomic(d) for c in clusters):
+    def schedule(self, dims, clusters):
+        if any(c.properties.is_parallel_atomic(dims) for c in clusters):
+            # Correctness -- enforce blocking where necessary.
+            # See also issue #276:PRO
             if any(c.is_sparse for c in clusters):
-                if not x:
-                    self.umt_sparse.iter()
-                return self.umt_sparse.next()
-            else:
-                if not x:
-                    self.umt_reduce.iter()
-                return self.umt_reduce.next()
-
-        # Performance heuristics -- use a smaller par-tile
-        if all(c.properties.is_blockable_small(d) for c in clusters):
-            if not x:
-                self.umt_small.iter()
-            return self.umt_small.next()
-
-        # We can't `self.umt.iter()` because we might still want to
-        # fallback to `self.umt_small`
-        item = self.umt.curitem() if x else self.umt.nextitem()
-
-        # Handle user-provided rules
-        # TODO: This is also rudimentary
-        if item.rule is None:
-            umt = self.umt
-        elif is_integer(item.rule):
-            if item.rule == self.tip:
-                umt = self.umt
+                umt = self.umt_sparse
             else:
-                umt = self.umt_small
-                if not x:
-                    umt.iter()
+                umt = self.umt_reduce
+
+        elif all(c.properties.avoid_tuning(dims) for c in clusters):
+            # Performance heuristics -- use a smaller par-tile
+            umt = self.umt_small
+
         else:
-            if item.rule in {d.name for d in prefix.itdims}:
-                umt = self.umt
-            else:
-                # This is like "pattern unmatched" -- fallback to `umt_small`
-                umt = self.umt_small
-                if not x:
-                    umt.iter()
+            umt = self.umt
+
+        umt.iter()
 
-        return umt.next()
+        return umt
 
 
 class SynthesizeSkewing(Queue):
@@ -661,3 +618,41 @@ def callback(self, clusters, prefix):
             processed.append(c.rebuild(exprs=exprs, ispace=ispace))
 
         return processed
+
+
+def apply_par_tiles(clusters, options, **kwargs):
+    """
+    Use the par-tile parameter to replace the symbolic BlockDimension sizes
+    with actual integer numbers representing the block shape.
+    """
+    if not options['par-tile']:
+        return clusters
+
+    blk_size_gen = BlockSizeGenerator(options['par-tile'])
+
+    key0 = lambda d: d.is_Block and d._depth == 2
+    key = lambda c: c.ispace.project(key0).itdims
+
+    processed = []
+    for bdims, group in groupby(clusters, key=key):
+        g = list(group)
+
+        if not bdims:
+            processed.extend(g)
+            continue
+
+        umt = blk_size_gen.schedule(bdims, g)
+
+        subs = {}
+        compact = []
+        for d in reversed(bdims):
+            try:
+                step = sympify(umt.next())
+                subs.update(d._rebuild_hierarchy(step=step))
+            except StopIteration:
+                # No more par-tile sizes available, so we drop the BlockDimensions
+                compact.append(d)
+
+        processed.extend(c.subs(subs, compact) for c in g)
+
+    return processed
diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py
index 9b936fba76..0df11d8afb 100644
--- a/devito/passes/iet/engine.py
+++ b/devito/passes/iet/engine.py
@@ -624,20 +624,10 @@ def _(i, mapper, sregistry):
     if i._depth != 2:
         return
 
-    p = i.parent
-    pp = i.parent.parent
+    callback = lambda i: sregistry.make_name(prefix=i)
+    subs = i._rebuild_hierarchy(callback)
 
-    name0 = pp.name
-    base = sregistry.make_name(prefix=name0)
-    name1 = sregistry.make_name(prefix=f'{base}_blk')
-
-    bd = i.parent._rebuild(name1, pp)
-    d = i._rebuild(name0, bd, i._min.subs(p, bd), i._max.subs(p, bd))
-
-    mapper.update({
-        i: d,
-        i.parent: bd
-    })
+    mapper.update(subs)
 
 
 @abstract_object.register(IncrDimension)
diff --git a/devito/types/dimension.py b/devito/types/dimension.py
index 41c1f90b88..665dda08d5 100644
--- a/devito/types/dimension.py
+++ b/devito/types/dimension.py
@@ -510,6 +510,16 @@ def __init_finalize__(self, name, parent):
         self.is_Time = parent.is_Time
         self.is_Space = parent.is_Space
 
+    def _rebuild_hierarchy(self, callback):
+        """
+        Recursively rebuild the hierarchy of DerivedDimensions rooted at `self`.
+
+        To be overridden by subclasses, since different DerivedDimensions may
+        have different requirements, and it'd be cumbersome to generalise it
+        all here.
+        """
+        raise NotImplementedError
+
     @property
     def parent(self):
         return self._parent
@@ -1319,6 +1329,34 @@ class BlockDimension(AbstractIncrDimension):
     is_Block = True
     is_PerfKnob = True
 
+    def _rebuild_hierarchy(self, callback=None, step=None):
+        if self._depth != 2:
+            raise NotImplementedError
+
+        p = self.parent
+        pp = self.parent.parent
+
+        name0 = pp.name
+
+        if callback is None:
+            name1 = p.name
+        else:
+            base = callback(name0)
+            name1 = callback(f'{base}_blk')
+
+        bd = p._rebuild(name1, pp, step=step or p.step)
+
+        subs = {p: bd}
+        if step is not None:
+            subs[p.step] = step
+
+        d = self._rebuild(
+            name0, bd,
+            self._min.subs(subs), self._max.subs(subs), size=self.size.subs(subs)
+        )
+
+        return {self: d, p: bd}
+
     @cached_property
     def _arg_names(self):
         try:
diff --git a/tests/test_dle.py b/tests/test_dle.py
index 2fd090b310..e3edbcad46 100644
--- a/tests/test_dle.py
+++ b/tests/test_dle.py
@@ -376,62 +376,6 @@ def test_structure_2p5D(self):
             assert iters[0].step == par_tile[1]
             assert iters[1].step == par_tile[0]
 
-    def test_custom_rule0(self):
-        grid = Grid(shape=(8, 8, 8))
-
-        u = TimeFunction(name="u", grid=grid, space_order=4)
-        v = TimeFunction(name="v", grid=grid, space_order=4)
-
-        eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx),
-                Eq(v.forward, u.forward.dx)]
-
-        # "Apply par-tile=(4, 4, 4) to the loop nest (kernel) with id (rule)=1,
-        # and use default for the rest!"
-        par_tile = (4, 4, 4)
-        rule = 1
-
-        op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule),
-                                                  'blockinner': True}))
-
-        # Check generated code. By having specified "1" as rule, we expect the
-        # given par-tile to be applied to the kernel with id 1
-        bns, _ = assert_blocking(op, {'z0_blk0', 'x0_blk0', 'z2_blk0'})
-        root = bns['x0_blk0']
-        iters = FindNodes(Iteration).visit(root)
-        iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1]
-        assert len(iters) == 3
-        assert all(i.step == j for i, j in zip(iters, par_tile, strict=True))
-
-    def test_custom_rule1(self):
-        grid = Grid(shape=(8, 8, 8))
-        x, y, z = grid.dimensions
-
-        f = Function(name='f', grid=grid)
-        u = TimeFunction(name="u", grid=grid, space_order=4)
-        v = TimeFunction(name="v", grid=grid, space_order=4)
-
-        eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx + cos(f)*cos(f[x+1, y, z])),
-                Eq(v.forward, u.forward.dx)]
-
-        # "Apply par-tile=(4, 4, 4) to the loop nests (kernels) embedded within
-        # the time loop, and use default for the rest!"
-        par_tile = (4, 4, 4)
-        rule = grid.time_dim.name  # We must be able to infer it from str
-
-        op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule),
-                                                  'blockinner': True,
-                                                  'blockrelax': True}))
-
-        # Check generated code. By having specified "time" as rule, we expect the
-        # given par-tile to be applied to the kernel within the time loop
-        bns, _ = assert_blocking(op, {'x0_blk0', 'x1_blk0', 'x2_blk0'})
-        for i in ['x0_blk0', 'x1_blk0', 'x2_blk0']:
-            root = bns[i]
-            iters = FindNodes(Iteration).visit(root)
-            iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1]
-            assert len(iters) == 3
-            assert all(i.step == j for i, j in zip(iters, par_tile, strict=True))
-
 
 @pytest.mark.parametrize("shape", [(10,), (10, 45), (20, 33), (10, 31, 45), (45, 31, 45)])
 @pytest.mark.parametrize("time_order", [2])
diff --git a/tests/test_dse.py b/tests/test_dse.py
index 9c3335c8af..dca6754b6c 100644
--- a/tests/test_dse.py
+++ b/tests/test_dse.py
@@ -2868,6 +2868,28 @@ def test_scalar_with_cond_tinvariant(self):
         assert str(body0(op).body[0]) == 'const float r0 = 1.0F/dt;'
         assert not body0(op).body[0].ispace
 
+    def test_block_temps_false(self):
+        grid = Grid(shape=(34, 45, 50))
+
+        a = Function(name='a', grid=grid, space_order=8)
+        u = TimeFunction(name='u', grid=grid, space_order=8)
+        v = u.func(name='v')
+
+        eqn = Eq(v.forward, a.dy.dz*u.dx.dy + v)
+
+        op = Operator(eqn, opt=('advanced', {'openmp': False,
+                                             'cire-block-temps': False,
+                                             'cire-maxpar': 'basic'}))
+
+        # Ensure it executes smoothly
+        op.apply(time_M=3)
+
+        assert_structure(
+            op,
+            ['t,x0_blk0,y0_blk0,x,y,z', 't,x1_blk0,y1_blk0,x,y,z'],
+            'tx0_blk0y0_blk0xyzx1_blk0y1_blk0xyz'
+        )
+
 
 class TestIsoAcoustic:
 

From ff9b90636a22763b39227e7135a04b4387ab9a00 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Mon, 15 Jun 2026 15:43:10 +0100
Subject: [PATCH 7/8] compiler: Drop unused SynthesizeBlocking.template

---
 devito/passes/clusters/aliases.py  | 5 ++---
 devito/passes/clusters/blocking.py | 3 ---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py
index 0b81e5db9b..47ddd7cf89 100644
--- a/devito/passes/clusters/aliases.py
+++ b/devito/passes/clusters/aliases.py
@@ -5,7 +5,6 @@
 import numpy as np
 import sympy
 
-from devito.arch import Device
 from devito.exceptions import CompilationError
 from devito.finite_differences import EvalDerivative, IndexDerivative, Weights
 from devito.ir import (
@@ -23,8 +22,8 @@
     is_integer, split, timed_pass
 )
 from devito.types import (
-    BlockDimension, CustomDimension, Eq, Hyperplane, IncrDimension, Indexed,
-    ModuloDimension, Size, StencilDimension, Symbol, Temp, TempArray, TempFunction
+    CustomDimension, Eq, Hyperplane, IncrDimension, Indexed, ModuloDimension, Size,
+    StencilDimension, Symbol, Temp, TempArray, TempFunction
 )
 from devito.types.grid import MultiSubDimension
 
diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py
index af659193fc..29f9791019 100644
--- a/devito/passes/clusters/blocking.py
+++ b/devito/passes/clusters/blocking.py
@@ -317,9 +317,6 @@ def callback(self, clusters, prefix):
 
 class SynthesizeBlocking(Queue):
 
-    #TODO: DROP?
-    template = "%s%d_blk%s"
-
     def __init__(self, sregistry, options):
         self.sregistry = sregistry
 

From f3c20e8d6b34edfb8dbefe131b88201fb431c3a8 Mon Sep 17 00:00:00 2001
From: Fabio Luporini <fabio@devitocodes.com>
Date: Tue, 16 Jun 2026 10:24:47 +0100
Subject: [PATCH 8/8] compiler: Comply with ruff

---
 devito/ir/support/space.py         | 5 +----
 devito/passes/clusters/aliases.py  | 2 +-
 devito/passes/clusters/blocking.py | 2 +-
 tests/test_dimension.py            | 2 +-
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py
index 96f3e3be07..1efafe5a98 100644
--- a/devito/ir/support/space.py
+++ b/devito/ir/support/space.py
@@ -910,10 +910,7 @@ def switch(self, d0, d1=None, direction=None):
         `d1`. Optionally, `d0` could be a mapper, in which case multiple Dimensions
         may be switched.
         """
-        if isinstance(d0, dict):
-            mapper = d0
-        else:
-            mapper = {d0: d1}
+        mapper = d0 if isinstance(d0, dict) else {d0: d1}
 
         intervals = self.intervals
         sub_iterators = dict(self.sub_iterators)
diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py
index 47ddd7cf89..aa5e955b6d 100644
--- a/devito/passes/clusters/aliases.py
+++ b/devito/passes/clusters/aliases.py
@@ -1706,7 +1706,7 @@ def make_blockdim_subiter(dimensions):
 
         pd = d.parent
 
-        if pd.parent.is_Block:
+        if pd.parent.is_Block:  # noqa SIM108
             # A BlockDimension in between BlockDimensions
             m = d.symbolic_min - pd.symbolic_min
         else:
diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py
index 29f9791019..3a5636a91a 100644
--- a/devito/passes/clusters/blocking.py
+++ b/devito/passes/clusters/blocking.py
@@ -16,7 +16,7 @@
 )
 from devito.types import BlockDimension
 
-__all__ = ['blocking', 'apply_par_tiles']
+__all__ = ['apply_par_tiles', 'blocking']
 
 
 def blocking(clusters, sregistry, options):
diff --git a/tests/test_dimension.py b/tests/test_dimension.py
index 665a31af77..c8de790431 100644
--- a/tests/test_dimension.py
+++ b/tests/test_dimension.py
@@ -2072,7 +2072,7 @@ def test_blocking_w_guard(self):
 
         op = Operator(eqns, opt=('advanced', {'openmp': True}))
 
-        op.cfunction
+        _ = op.cfunction
 
         assert_structure(op, ['t,x0_blk0,y0_blk0,x,y,z',
                               't,x0_blk0,y0_blk0,x,y',