From 140b3a53c7574790fe23d40e58781e6d0232be68 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 19 May 2026 15:46:02 +0100 Subject: [PATCH 1/8] compiler: Attach deriv_order to IndexDerivative --- devito/finite_differences/differentiable.py | 12 ++++++++++-- devito/finite_differences/finite_difference.py | 4 +++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/devito/finite_differences/differentiable.py b/devito/finite_differences/differentiable.py index 6322b8ca8f..cfaa31ebc7 100644 --- a/devito/finite_differences/differentiable.py +++ b/devito/finite_differences/differentiable.py @@ -985,8 +985,9 @@ def value(self, idx): class IndexDerivative(IndexSum): __rargs__ = ('expr', 'mapper') + __rkwargs__ = IndexSum.__rkwargs__ + ('deriv_order',) - def __new__(cls, expr, mapper, **kwargs): + def __new__(cls, expr, mapper, deriv_order=None, **kwargs): dimensions = as_tuple(set(mapper.values())) # Detect the Weights among the arguments @@ -1008,6 +1009,8 @@ def __new__(cls, expr, mapper, **kwargs): obj._weights = weights obj._mapper = frozendict(mapper) + obj._deriv_order = deriv_order + return obj def _hashable_content(self): @@ -1036,6 +1039,10 @@ def weights(self): def mapper(self): return self._mapper + @property + def deriv_order(self): + return self._deriv_order + @property def depth(self): iderivs = self.expr.find(IndexDerivative) @@ -1212,7 +1219,8 @@ def _diff2sympy(obj): # Handle special objects if isinstance(obj, DiffDerivative): - return IndexDerivative(*args, obj.mapper), True + return IndexDerivative(*args, obj.mapper, + deriv_order=obj.deriv_order), True # Handle generic objects such as arithmetic operations try: diff --git a/devito/finite_differences/finite_difference.py b/devito/finite_differences/finite_difference.py index 30199fb3d8..33c1a95a30 100644 --- a/devito/finite_differences/finite_difference.py +++ b/devito/finite_differences/finite_difference.py @@ -209,7 +209,9 @@ def make_derivative(expr, dim, fd_order, deriv_order, side, matvec, x0, coeffici with suppress(AttributeError): expr = expr._evaluate(expand=False) - deriv = DiffDerivative(expr*weights, {dim: indices.free_dim}) + deriv = DiffDerivative( + expr*weights, {dim: indices.free_dim}, deriv_order=deriv_order + ) else: terms = [] for i, c in zip(indices, weights, strict=True): From 83c510d21df7d3d1b83cd31307c233c83262d7fc Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 25 May 2026 11:20:05 +0100 Subject: [PATCH 2/8] compiler: Add cire-maxpar=compact mode --- devito/core/gpu.py | 2 +- devito/core/operator.py | 3 ++ devito/passes/clusters/aliases.py | 68 +++++++++++++++++++++++++------ tests/test_dse.py | 13 +++--- 4 files changed, 67 insertions(+), 19 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index bd13623441..06cb42eb37 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -78,7 +78,7 @@ def _normalize_kwargs(cls, **kwargs): # CIRE o['min-storage'] = False o['cire-rotate'] = False - o['cire-maxpar'] = oo.pop('cire-maxpar', True) + o['cire-maxpar'] = oo.pop('cire-maxpar', 'basic') o['cire-ftemps'] = oo.pop('cire-ftemps', False) o['cire-mingain'] = oo.pop('cire-mingain', cls.CIRE_MINGAIN) o['cire-minmem'] = oo.pop('cire-minmem', cls.CIRE_MINMEM) diff --git a/devito/core/operator.py b/devito/core/operator.py index 8a52cbf98a..16c403ba79 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -240,6 +240,9 @@ def _check_kwargs(cls, **kwargs): if oo['mpi'] and oo['mpi'] not in cls.MPI_MODES: raise InvalidOperator(f"Unsupported MPI mode `{oo['mpi']}`") + if oo['cire-maxpar'] not in (False, 'basic', 'compact'): + raise InvalidOperator("Illegal `cire-maxpar` value") + if oo['cse-algo'] not in ('basic', 'smartsort', 'advanced'): raise InvalidOperator("Illegal `cse-algo` value") diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 58078847a2..20647c2084 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -140,8 +140,10 @@ def _aliases_from_clusters(self, cgroup, exclude, meta): schedule, exprs = self._select(variants) # Schedule -> Schedule (optimization) - if self.opt_maxpar: - schedule = optimize_schedule_maxpar(schedule) + if self.opt_maxpar == 'basic': + schedule = optimize_schedule_maxpar_basic(schedule) + elif self.opt_maxpar == 'compact': + schedule = optimize_schedule_maxpar_compact(schedule) # Schedule -> Schedule (optimization) if self.opt_rotate: @@ -285,7 +287,7 @@ class CireInvariants(CireTransformerLegacy, Queue): def __init__(self, sregistry, options, platform): super().__init__(sregistry, options, platform) - self.opt_maxpar = True + self.opt_maxpar = 'basic' self.opt_schedule_strategy = None def process(self, clusters): @@ -756,8 +758,11 @@ def lower_aliases(aliases, meta, maxpar): meta.ispace.directions) ispace = ispace.augment(sub_iterators) + # For now, the guards coincide with the original ones, if any + guards = meta.guards + processed.append( - ScheduledAlias(a.pivot, writeto, ispace, a.aliaseds, indicess) + ScheduledAlias(a.pivot, writeto, ispace, a.aliaseds, indicess, guards) ) # The [ScheduledAliases] must be ordered so as to reuse as many of the @@ -874,7 +879,7 @@ def optimize_schedule_rotations(schedule, sregistry): ispace = IterationSpace.union(rispace, aispace, relations={(d, cd, d1)}) processed.append(ScheduledAlias( - pivot, writeto, ispace, i.aliaseds, indicess, + pivot, writeto, ispace, i.aliaseds, indicess, i.guards )) # Update the rotations mapper @@ -883,9 +888,11 @@ def optimize_schedule_rotations(schedule, sregistry): return schedule.rebuild(*processed, rmapper=rmapper) -def optimize_schedule_maxpar(schedule): +def optimize_schedule_maxpar_basic(schedule): """ - Bump the IterationSpace' stamp trading fusion for more collapse-parallelism. + Trade fusion for more collapse-parallelism. This is more conservative than + `optimize_schedule_maxpar_compact`, since if two IterationSpaces have + different endpoints along the same Dimensions, they won't be fused together. """ key = lambda i: (i.writeto, i.ispace) @@ -900,13 +907,49 @@ def optimize_schedule_maxpar(schedule): ispace = ispace0.lift(dims, stamp) processed.extend([ - ScheduledAlias(pivot, writeto, ispace, aliaseds, indicess) - for pivot, _, _, aliaseds, indicess in g + ScheduledAlias(pivot, writeto, ispace, aliaseds, indicess, guards) + for pivot, _, _, aliaseds, indicess, guards in g ]) return schedule.rebuild(*processed) +def optimize_schedule_maxpar_compact(schedule): + """ + Aggressively trade fusion for more collapse-parallelism. Unlike + `optimize_schedule_maxpar_basic`, this will fuse together IterationSpaces with + different endpoints along the same Dimensions by introducing guards to + preserve the original semantics. + """ + stamp = Stamp() + + # The union IterationSpace will enable maximal fusion + ispace_union = IterationSpace.union(*[i.ispace for i in schedule]) + + processed = [] + for pivot, writeto, ispace, aliaseds, indicess, guards in schedule: + dims = writeto.itdims + + # Do we need to introduce guards to preserve the original semantics? + for d in dims: + int0 = ispace[d] + int1 = ispace_union[d] + + if int0 != int1: + guard = sympy.And(d >= int0.symbolic_min, d <= int0.symbolic_max) + guards = guards.xandg(d, guard) + + # This is conceptually identical to what we do for the `basic` mode + writeto = writeto.lift(dims, stamp) + ispace = ispace_union.lift(dims, stamp) + + processed.append(ScheduledAlias( + pivot, writeto, ispace, aliaseds, indicess, guards + )) + + return schedule.rebuild(*processed) + + def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype, opt_minmem): """ @@ -917,7 +960,7 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype, clusters = [] subs = {} - for pivot, writeto, ispace, aliaseds, indicess in schedule: + for pivot, writeto, ispace, aliaseds, indicess, guards in schedule: name = sregistry.make_name() # Infer the dtype for the pivot # This prevents cases such as `floor(a*b)` with `a` and `b` floats @@ -1013,7 +1056,7 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype, properties[Hyperplane(writeto.itdims)] = {SEPARABLE} # Finally, build the alias Cluster - clusters.append(Cluster(expression, ispace, meta.guards, properties)) + clusters.append(Cluster(expression, ispace, guards, properties)) return clusters, subs @@ -1432,7 +1475,8 @@ def is_frame(self): ScheduledAlias = namedtuple('SchedAlias', - 'pivot writeto ispace aliaseds indicess') + 'pivot writeto ispace aliaseds indicess guards') +ScheduledAlias.__new__.__defaults__ = (None, None, None, None, None, None) class Schedule(tuple): diff --git a/tests/test_dse.py b/tests/test_dse.py index 3dcbd40336..9c3335c8af 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -2164,7 +2164,7 @@ def test_derivatives_from_different_levels(self): @pytest.mark.parametrize('rotate', [False, True]) def test_maxpar_option(self, rotate): """ - Test the compiler option `cire-maxpar=True`. + Test the compiler option `cire-maxpar='basic'`. """ grid = Grid(shape=(10, 10, 10)) @@ -2179,7 +2179,8 @@ def test_maxpar_option(self, rotate): eq = Eq(u.forward, f*u.dy.dy) op0 = Operator(eq, opt='noop') - op1 = Operator(eq, opt=('advanced', {'cire-maxpar': True, 'cire-rotate': rotate})) + op1 = Operator(eq, opt=('advanced', {'cire-maxpar': 'basic', + 'cire-rotate': rotate})) # Check code generation bns, _ = assert_blocking(op1, {'x0_blk0'}) @@ -2248,7 +2249,7 @@ def test_multiple_rotating_dims(self): def test_maxpar_option_v2(self): """ - Another test for the compiler option `cire-maxpar=True`. + Another test for the compiler option `cire-maxpar='basic'`. """ grid = Grid(shape=(10, 10, 10)) @@ -2263,7 +2264,7 @@ def test_maxpar_option_v2(self): eq = Eq(u.forward, f*u.dx.dx) op0 = Operator(eq, opt='noop') - op1 = Operator(eq, opt=('advanced', {'cire-maxpar': True})) + op1 = Operator(eq, opt=('advanced', {'cire-maxpar': 'basic'})) # Check code generation bns, _ = assert_blocking(op1, {'x0_blk0'}) @@ -2279,7 +2280,7 @@ def test_maxpar_option_v2(self): def test_maxpar_option_v3(self): """ - Another test for the compiler option `cire-maxpar=True`. + Another test for the compiler option `cire-maxpar='basic'`. """ grid = Grid(shape=(10, 10)) @@ -2288,7 +2289,7 @@ def test_maxpar_option_v3(self): eq = Eq(u.forward, u.dx.dx + v.dx.dy) - op = Operator(eq, opt=('advanced', {'cire-maxpar': True})) + op = Operator(eq, opt=('advanced', {'cire-maxpar': 'basic'})) # Check code generation xs, ys = get_params(op, 'x_size', 'y_size') From 1ef0ada99da2f868222f14a031736ec2df30811a Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 25 May 2026 11:20:28 +0100 Subject: [PATCH 3/8] compiler: Change blocking strategy in presence of guards --- devito/ir/clusters/visitors.py | 21 ++++++++++++--------- devito/passes/clusters/blocking.py | 13 ++++++++++--- tests/test_dimension.py | 21 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/devito/ir/clusters/visitors.py b/devito/ir/clusters/visitors.py index 11bcad5365..371ed4e406 100644 --- a/devito/ir/clusters/visitors.py +++ b/devito/ir/clusters/visitors.py @@ -37,15 +37,7 @@ def _make_key(self, cluster, level): assert self._q_ispace_in_key ispace = cluster.ispace[:level] - if self._q_guards_in_key: - try: - guards = tuple(cluster.guards.get(i.dim) for i in ispace) - except AttributeError: - # `cluster` is actually a ClusterGroup - assert len(cluster.guards) == 1 - guards = tuple(cluster.guards[0].get(i.dim) for i in ispace) - else: - guards = None + guards = self._make_key_guards(cluster, ispace) if self._q_properties_in_key: properties = cluster.properties.drop(cluster.ispace[level:].itdims) @@ -68,6 +60,17 @@ def _make_key(self, cluster, level): return (prefix,) + subkey + def _make_key_guards(self, cluster, ispace): + if not self._q_guards_in_key: + return None + + try: + return tuple(cluster.guards.get(i.dim) for i in ispace) + except AttributeError: + # `cluster` is actually a ClusterGroup + assert len(cluster.guards) == 1 + return tuple(cluster.guards[0].get(i.dim) for i in ispace) + def _make_key_hook(self, cluster, level): return () diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index d04962a52c..e9076bfa85 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -316,8 +316,6 @@ def callback(self, clusters, prefix): class SynthesizeBlocking(Queue): - _q_guards_in_key = True - template = "%s%d_blk%s" def __init__(self, sregistry, options): @@ -339,6 +337,13 @@ def process(self, clusters): return self._process_fdta(clusters, 1, blk_size_gen=blk_size_gen) + def _make_key_guards(self, cluster, ispace): + # Essentially, this will ensure that loop nests within guards *not* + # pertaining to the blockable Dimension are given a dedicated set of + # BlockDimensions, e.g. see `test_dimension.py::test_no_fusion_convoluted` + return tuple(cluster.guards.get(i.dim) for i in ispace + if not cluster.properties.is_blockable(i.dim)) + def _derive_block_dims(self, clusters, prefix, d, blk_size_gen): if blk_size_gen is not None: step = sympify(blk_size_gen.next(prefix, d, clusters)) @@ -397,7 +402,9 @@ def callback(self, clusters, prefix, blk_size_gen=None): # Use the innermost BlockDimension in place of `d` subs = {d: bd} exprs = [uxreplace(e, subs) for e in c.exprs] - guards = {subs.get(i, i): v for i, v in c.guards.items()} + guards = { + subs.get(i, i): uxreplace(v, subs) for i, v in c.guards.items() + } # The new Cluster properties -- TILABLE is dropped after blocking properties = c.properties.drop(d) diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 713378fb4f..665a31af77 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -2058,6 +2058,27 @@ def test_factor_and_condition(self): for t in range(buffer_size): assert np.all(usaved.data[t] == t*factor + bounds[0] - 1) + def test_blocking_w_guard(self): + grid = Grid(shape=(8, 8, 8)) + x, y, z = grid.dimensions + + u = TimeFunction(name="u", grid=grid, space_order=2) + v = u.func(name="v") + + cdim = ConditionalDimension(name="cdim", parent=y, condition=Ge(y, 4)) + + eqns = [Eq(u.forward, u.dx + v.dy + 1), + Eq(v.forward, v.dx + u.dy + 1, implicit_dims=cdim)] + + op = Operator(eqns, opt=('advanced', {'openmp': True})) + + op.cfunction + + assert_structure(op, ['t,x0_blk0,y0_blk0,x,y,z', + 't,x0_blk0,y0_blk0,x,y', + 't,x0_blk0,y0_blk0,x,y,z'], + 't,x0_blk0,y0_blk0,x,y,z,z') + class TestCustomDimension: From ca96cdf58bb775a141e9010d1d4bd46593c445ce Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 1 Jun 2026 08:44:32 +0100 Subject: [PATCH 4/8] tests: Remove useless sympy.Float cast --- tests/test_symbolics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_symbolics.py b/tests/test_symbolics.py index aab8502a20..5529870e5e 100644 --- a/tests/test_symbolics.py +++ b/tests/test_symbolics.py @@ -579,7 +579,7 @@ def test_component_access_symbol_printing(): acc = dSymbol(name='acc', dtype=dtypes_vector_mapper[(np.float32, 4)]) expr = ComponentAccess(acc, 0) - assert ccode(sympy.Float('1.25')*expr, dtype=expr.dtype) == '1.250F*acc.x' + assert ccode(1.25*expr, dtype=expr.dtype) == '1.250F*acc.x' def test_vector_access(): From 3de03cafbd4040262c5d37d0048286f707fbceb1 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 3 Jun 2026 11:38:35 +0100 Subject: [PATCH 5/8] compiler: Fix DDA involving TensorMoves --- devito/ir/support/basic.py | 30 +++++++++++++++++++-------- devito/symbolics/extended_sympy.py | 11 +++++++++- devito/types/parallel.py | 33 +++++++++++++++++++++++++----- 3 files changed, 59 insertions(+), 15 deletions(-) diff --git a/devito/ir/support/basic.py b/devito/ir/support/basic.py index 7939ee8fe8..6d91981e0a 100644 --- a/devito/ir/support/basic.py +++ b/devito/ir/support/basic.py @@ -15,11 +15,11 @@ ) from devito.tools import ( CacheInstances, Tag, as_mapper, as_tuple, filter_sorted, flatten, is_integer, - memoized_generator, memoized_meth, smart_gt, smart_lt + memoized_generator, memoized_meth, smart_gt, smart_lt, split ) from devito.types import ( ComponentAccess, CriticalRegion, Dimension, DimensionTuple, Fence, Function, Symbol, - TBArray, Temp, TempArray + TBArray, Temp, TempArray, TensorMove ) __all__ = ['ExprGeometry', 'IterationInstance', 'Scope', 'TimedAccess'] @@ -1383,19 +1383,31 @@ def vinf(entries): def retrieve_accesses(exprs, **kwargs): """ - Like retrieve_terminals, but ensure that if a ComponentAccess is found, - the ComponentAccess itself is returned, while the wrapped Indexed is discarded. + Similar to `retrieve_terminals`, but with some adjustments: + + * ComponentAccess's are retained, but the wrapped Indexed are discarded; + * TensorMove's are upcasted to the logical Indexed they represent. """ kwargs['mode'] = 'unique' compaccs = search(exprs, ComponentAccess) - if not compaccs: - return retrieve_terminals(exprs, **kwargs) - subs = {i: Symbol(f'dummy{n}') for n, i in enumerate(compaccs)} - exprs1 = uxreplace(exprs, subs) + if compaccs: + # Handle ComponentAccesses + subs = {i: Symbol(f'dummy{n}') for n, i in enumerate(compaccs)} + exprs1 = uxreplace(exprs, subs) + terms1 = retrieve_terminals(exprs1, **kwargs) + + accesses = compaccs | terms1 - set(subs.values()) + else: + accesses = retrieve_terminals(exprs, **kwargs) + + # Handle TensorMoves + key = lambda i: isinstance(i, TensorMove) + tmovs, other = split(accesses, key) + accesses = {i.access for i in tmovs} | other - return compaccs | retrieve_terminals(exprs1, **kwargs) - set(subs.values()) + return accesses def disjoint_test(e0, e1, d, it): diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py index 86979c8bbd..7e466cf352 100644 --- a/devito/symbolics/extended_sympy.py +++ b/devito/symbolics/extended_sympy.py @@ -923,7 +923,12 @@ def _expected_alignment_elems(self): @property def base(self): - return self.args[0] + """ + The IndexedBase of the aligned access. + + To be implemented by subclasses. + """ + raise NotImplementedError func = Reserved._rebuild @@ -965,6 +970,10 @@ def _expected_alignment(self): f"Unsupported dtype `{self.function.dtype}` for VectorAccess" ) + @property + def base(self): + return self.args[0] + @property def indices(self): return self.base.indices diff --git a/devito/types/parallel.py b/devito/types/parallel.py index 09498df986..b92978de1f 100644 --- a/devito/types/parallel.py +++ b/devito/types/parallel.py @@ -408,25 +408,40 @@ class TensorMove(AlignedAccess, Terminal): Represent the LOAD/STORE of a multi-dimensional block of data from/to a higher level of the memory hierarchy. + TensorMoves can move blocks of data of arbitrary size without worrying about + potential out-of-bounds (OOB) accesses: + + * if an OOB point is read, the retrieved tensor is filled in with Nans/0s; + * attempted writes of OOB points via tensors exceeding the legal data region + are automatically discarded. + + Thus, TensorMoves do not require OOB protection with guards. + + However, the coordinates must be greater or equal than 0. + Parameters ---------- - base : IndexedBase - The base of the AbstractFunction subject of the TensorMove. + access : Indexed + The logical Indexed the TensorMove represents. tid0 : Dimension A representation of thread(s) issuing the TensorMove. coords : tuple The base address of the TensorMove (one point per Dimension). """ - __rargs__ = ('base', 'tid0', 'coords') + __rargs__ = ('access', 'tid0', 'coords') _expected_alignment = 16 """ The expected alignment in bytes for the underlying LOAD/STORE operation. """ - def __new__(cls, base, tid0, coords, **kwargs): - return super().__new__(cls, base, tid0, coords) + def __new__(cls, access, tid0, coords, **kwargs): + return super().__new__(cls, access, tid0, coords) + + @property + def access(self): + return self.args[0] @property def tid0(self): @@ -436,6 +451,14 @@ def tid0(self): def coords(self): return self.args[2] + @property + def function(self): + return self.access.function + + @property + def base(self): + return self.function.base + @cached_property def indexed(self): return self.function[self.coords] From f67a3c1e8c32f866bb62344979874cbcdec01898 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 5 Jun 2026 17:43:59 +0100 Subject: [PATCH 6/8] compiler: Revamp to enable blocking before CIRE --- devito/core/cpu.py | 8 +- devito/core/gpu.py | 10 +- devito/core/operator.py | 7 + devito/ir/clusters/cluster.py | 37 +++- devito/ir/support/guards.py | 25 +++ devito/ir/support/properties.py | 45 ++-- devito/ir/support/space.py | 34 ++- devito/ir/support/syncs.py | 3 + devito/passes/clusters/aliases.py | 324 ++++++++++++++++++----------- devito/passes/clusters/blocking.py | 153 +++++++------- devito/passes/iet/engine.py | 16 +- devito/types/dimension.py | 38 ++++ tests/test_dle.py | 56 ----- tests/test_dse.py | 22 ++ 14 files changed, 481 insertions(+), 297 deletions(-) diff --git a/devito/core/cpu.py b/devito/core/cpu.py index 3a55a4a4c5..79d1b409e7 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -5,8 +5,8 @@ from devito.operator.operator import rcompile from devito.passes import stream_dimensions from devito.passes.clusters import ( - Lift, blocking, buffering, cire, cse, factorize, fission, fuse, optimize_hyperplanes, - optimize_pows + Lift, apply_par_tiles, blocking, buffering, cire, cse, factorize, fission, fuse, + optimize_hyperplanes, optimize_pows ) from devito.passes.equations import collect_derivatives from devito.passes.iet import ( @@ -67,6 +67,7 @@ def _normalize_kwargs(cls, **kwargs): reduce=oo.pop('par-tile-reduce', None)) # CIRE + o['cire-block-temps'] = oo.pop('cire-block-temps', cls.CIRE_BLOCK_TEMPS) o['min-storage'] = oo.pop('min-storage', False) o['cire-rotate'] = oo.pop('cire-rotate', False) o['cire-maxpar'] = oo.pop('cire-maxpar', False) @@ -198,6 +199,9 @@ def _specialize_clusters(cls, clusters, **kwargs): if options['blocklazy']: clusters = blocking(clusters, sregistry, options) + # Unfold the `par-tile`s, if any + clusters = apply_par_tiles(clusters, **kwargs) + return clusters @classmethod diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 06cb42eb37..f907d4be87 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -7,8 +7,8 @@ from devito.operator.operator import rcompile from devito.passes import is_on_device, stream_dimensions from devito.passes.clusters import ( - Lift, blocking, buffering, cire, cse, factorize, fission, fuse, memcpy_prefetch, - optimize_pows, tasking + Lift, apply_par_tiles, blocking, buffering, cire, cse, factorize, fission, fuse, + memcpy_prefetch, optimize_pows, tasking ) from devito.passes.equations import collect_derivatives from devito.passes.iet import ( @@ -38,7 +38,9 @@ class DeviceOperatorMixin: + # Overrides the default values in the main Operator class BLOCK_LEVELS = 0 + CIRE_BLOCK_TEMPS = False MPI_MODES = (True, 'basic',) GPU_FIT = 'all-fallback' @@ -76,6 +78,7 @@ def _normalize_kwargs(cls, **kwargs): o['skewing'] = oo.pop('skewing', False) # CIRE + o['cire-block-temps'] = oo.pop('cire-block-temps', cls.CIRE_BLOCK_TEMPS) o['min-storage'] = False o['cire-rotate'] = False o['cire-maxpar'] = oo.pop('cire-maxpar', 'basic') @@ -239,6 +242,9 @@ def _specialize_clusters(cls, clusters, **kwargs): if options['blocklazy']: clusters = blocking(clusters, sregistry, options) + # Unfold the `par-tile`s, if any + clusters = apply_par_tiles(clusters, **kwargs) + return clusters @classmethod diff --git a/devito/core/operator.py b/devito/core/operator.py index 16c403ba79..2494dbd7cf 100644 --- a/devito/core/operator.py +++ b/devito/core/operator.py @@ -58,6 +58,13 @@ class BasicOperator(Operator): situations where the performance impact might be detrimental. """ + CIRE_BLOCK_TEMPS = True + """ + If an aliasing expression is computed within a blocked loop nest, all CIRE- + generated temporaries will inherit the block shape. If set to False, the + temporaries shape will systematically be defined by the root Dimensions. + """ + CIRE_MINGAIN = 10 """ Minimum operation count reduction for a redundant expression to be optimized diff --git a/devito/ir/clusters/cluster.py b/devito/ir/clusters/cluster.py index cbf206b3ff..f40855121e 100644 --- a/devito/ir/clusters/cluster.py +++ b/devito/ir/clusters/cluster.py @@ -13,7 +13,7 @@ ) from devito.mpi.halo_scheme import HaloScheme, HaloTouch from devito.mpi.reduction_scheme import DistReduce -from devito.symbolics import estimate_cost +from devito.symbolics import estimate_cost, uxreplace from devito.tools import as_tuple, filter_ordered, flatten, infer_dtype from devito.types import ( CriticalRegion, Fence, Indexed, PhaseMarker, TensorMove, ThreadArrive, ThreadCommit, @@ -128,6 +128,33 @@ def rebuild(self, *args, **kwargs): syncs=kwargs.get('syncs', self.syncs), halo_scheme=kwargs.get('halo_scheme', self.halo_scheme)) + def subs(self, mapper, compact=()): + """ + Build a new Cluster applying substitutions rules to `self`. + """ + if not mapper: + return self + + if self.halo_scheme: + raise NotImplementedError + + key0 = lambda i: i.is_Block + subs0 = {d: self.ispace[d].promote(key0).dim for d in compact} + + subs = {**mapper, **subs0} + exprs = [uxreplace(e, subs) for e in self.exprs] + + ispace = self.ispace.switch(mapper) + key = lambda i: key0(i) and i in flatten(d._defines for d in subs0) + ispace = ispace.promote(key, mode='total') + + guards = self.guards.subs(mapper).promote(subs0) + properties = self.properties.subs(mapper).promote(subs0) + syncs = self.syncs.subs(mapper) + + return self.__class__(exprs=exprs, ispace=ispace, guards=guards, + properties=properties, syncs=syncs) + @property def exprs(self): return self._exprs @@ -591,6 +618,14 @@ def dspace(self): """Return the DataSpace of this ClusterGroup.""" return DataSpace.union(*[i.dspace.reset() for i in self]) + @property + def is_dense(self): + return all(i.is_dense for i in self) + + @property + def is_wild(self): + return all(i.is_wild for i in self) + @property def is_halo_touch(self): return all(i.is_halo_touch for i in self) diff --git a/devito/ir/support/guards.py b/devito/ir/support/guards.py index b8a335b1f4..ac808eeb89 100644 --- a/devito/ir/support/guards.py +++ b/devito/ir/support/guards.py @@ -342,6 +342,31 @@ def as_map(self, d, cls): return dict(i.args for i in search(self.get(d), cls)) + def subs(self, mapper): + m = {mapper.get(d, d): v.xreplace(mapper) for d, v in self.items()} + + return Guards(m) + + def promote(self, subs): + m = self + for d, v in subs.items(): + guards = {self.get(i) for i in d._defines} - {true} + if len(guards) > 1: + raise NotImplementedError( + f"Cannot promote {d} to {v} due to multiple guards: {guards}" + ) + elif len(guards) == 0: + continue + + guard = guards.pop() + guard = guard.xreplace({d: v}) + + m = m.impose(v, guard) + + m = m.popany(subs) + + return m + class GuardExpr(LocalObject, BooleanFunction): diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py index 9e787a8b9e..b0033b2485 100644 --- a/devito/ir/support/properties.py +++ b/devito/ir/support/properties.py @@ -39,10 +39,11 @@ def __init__(self, name, val=None): TILABLE = Property('tilable') """A fully parallel Dimension that would benefit from tiling (or "blocking").""" -TILABLE_SMALL = Property('tilable*') +NO_TUNING = Property('notuning') """ -Like TILABLE, but it would benefit from relatively small block, since the -iteration space is likely to be very small. +A Dimension that would be unlikely to benefit from tuning. For example, the +underlying iteration space is relatively small, and/or the enclosed expressions +are not characterized by data reuse, etc. """ SKEWABLE = Property('skewable') @@ -115,7 +116,6 @@ def __init__(self, name, val=None): # Bundles PARALLELS = {PARALLEL, PARALLEL_INDEP, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT} -TILABLES = {TILABLE, TILABLE_SMALL} def normalize_properties(*args): @@ -253,16 +253,10 @@ def prefetchable(self, dims, v=PREFETCHABLE): m[d] = self.get(d, set()) | {v} return Properties(m) - def block(self, dims, kind='default'): - if kind == 'default': - p = TILABLE - elif kind == 'small': - p = TILABLE_SMALL - else: - raise ValueError + def block(self, dims): m = dict(self) for d in as_tuple(dims): - m[d] = set(self.get(d, [])) | {p} + m[d] = set(self.get(d, [])) | {TILABLE} return Properties(m) def inbound(self, dims): @@ -289,6 +283,9 @@ def init_halo_right_shm(self, dims): INIT_HALO_LEFT_SHM}) return properties + def notune(self, dims): + return self.add(dims, NO_TUNING) + def is_parallel(self, dims): return any(len(self[d] & {PARALLEL, PARALLEL_INDEP}) > 0 for d in as_tuple(dims)) @@ -310,10 +307,7 @@ def is_sequential(self, dims): return any(SEQUENTIAL in self.get(d, ()) for d in as_tuple(dims)) def is_blockable(self, d): - return bool(self.get(d, set()) & {TILABLE, TILABLE_SMALL}) - - def is_blockable_small(self, d): - return TILABLE_SMALL in self.get(d, set()) + return bool(TILABLE in self.get(d, ())) def _is_property_any(self, dims, v): if dims is None: @@ -335,6 +329,25 @@ def is_halo_right_init(self, dims=None): def is_halo_init(self, dims=None): return self.is_halo_left_init(dims) or self.is_halo_right_init(dims) + def avoid_tuning(self, dims): + return any(NO_TUNING in self.get(d, set()) for d in as_tuple(dims)) + + def subs(self, mapper): + return Properties({mapper.get(d, d): v for d, v in self.items()}) + + def promote(self, subs): + m = self + for d, pd in subs.items(): + if pd not in d._defines: + raise ValueError(f"Cannot promote {d} to {pd} as {pd} does not " + f"belong to {d}'s hierarchy") + + v = normalize_properties(*[self.get(i, set()) for i in d._defines]) + + m = self.drop(d._defines).add(pd, v) + + return m + @property def nblockable(self): return sum([self.is_blockable(d) for d in self]) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 1c760128b5..96f3e3be07 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -478,7 +478,12 @@ def relaxed(self, d=None): def promote(self, cond, mode='unordered'): intervals = [i.promote(cond) for i in self] - intervals = IntervalGroup(intervals, relations=None, mode=mode) + + if mode == 'total': + relations = [filter_ordered(i.dim for i in intervals)] + else: + relations = None + intervals = IntervalGroup(intervals, relations=relations, mode=mode) # There could be duplicate Dimensions at this point, so we sum up the # Intervals defined over the same Dimension to produce a well-defined @@ -899,16 +904,29 @@ def augment(self, sub_iterators): return IterationSpace(self.intervals, items, self.directions) - def switch(self, d0, d1, direction=None): - intervals = self.intervals.switch(d0, d1) + def switch(self, d0, d1=None, direction=None): + """ + Construct a new IterationSpace in which the Dimension `d0` is replaced with + `d1`. Optionally, `d0` could be a mapper, in which case multiple Dimensions + may be switched. + """ + if isinstance(d0, dict): + mapper = d0 + else: + mapper = {d0: d1} + intervals = self.intervals sub_iterators = dict(self.sub_iterators) - sub_iterators.pop(d0, None) - sub_iterators[d1] = () - directions = dict(self.directions) - v = directions.pop(d0, None) - directions[d1] = direction or v + + for d0, d1 in mapper.items(): + intervals = intervals.switch(d0, d1) + + sub_iterators.pop(d0, None) + sub_iterators[d1] = () + + v = directions.pop(d0, None) + directions[d1] = direction or v return IterationSpace(intervals, sub_iterators, directions) diff --git a/devito/ir/support/syncs.py b/devito/ir/support/syncs.py index 67623e5fe0..d7f4f86bfe 100644 --- a/devito/ir/support/syncs.py +++ b/devito/ir/support/syncs.py @@ -175,6 +175,9 @@ def update(self, ops): m[d] = set(self.get(d, [])) | set(v) return Ops(m) + def subs(self, mapper): + return Ops({mapper.get(d, d): v for d, v in self.items()}) + def _get_sync(self, cls, dims=None): if dims is None: dims = list(self) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 20647c2084..0b81e5db9b 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -5,6 +5,7 @@ import numpy as np import sympy +from devito.arch import Device from devito.exceptions import CompilationError from devito.finite_differences import EvalDerivative, IndexDerivative, Weights from devito.ir import ( @@ -18,12 +19,12 @@ uxreplace ) from devito.tools import ( - Stamp, as_mapper, as_tuple, flatten, frozendict, generator, is_integer, split, - timed_pass + Reconstructable, Stamp, as_mapper, as_tuple, flatten, frozendict, generator, + is_integer, split, timed_pass ) from devito.types import ( - CustomDimension, Eq, Hyperplane, IncrDimension, Indexed, ModuloDimension, Size, - StencilDimension, Symbol, Temp, TempArray, TempFunction + BlockDimension, CustomDimension, Eq, Hyperplane, IncrDimension, Indexed, + ModuloDimension, Size, StencilDimension, Symbol, Temp, TempArray, TempFunction ) from devito.types.grid import MultiSubDimension @@ -112,6 +113,7 @@ def __init__(self, sregistry, options, platform): self.sregistry = sregistry self.platform = platform + self.opt_block_temps = options['cire-block-temps'] self.opt_minstorage = options['min-storage'] self.opt_rotate = options['cire-rotate'] self.opt_ftemps = options['cire-ftemps'] @@ -129,7 +131,9 @@ def _aliases_from_clusters(self, cgroup, exclude, meta): exprs, aliases = self._choose(found, cgroup, mapper) # AliasList -> Schedule - schedule = lower_aliases(aliases, meta, self.opt_maxpar) + schedule = lower_aliases( + aliases, meta, self.opt_maxpar, self.opt_block_temps + ) variants.append(make_variant(schedule, exprs, mapper)) @@ -158,7 +162,8 @@ def _aliases_from_clusters(self, cgroup, exclude, meta): if self.opt_multisubdomain: processed = optimize_clusters_msds(processed) - # [Clusters]_k -> [Clusters]_{k+n} + # [Clusters]_m -> [Clusters]_{m} + rebuilt = [] for c in cgroup: n = len(c.exprs) cexprs, exprs = exprs[:n], exprs[n:] @@ -168,10 +173,16 @@ def _aliases_from_clusters(self, cgroup, exclude, meta): ispace = c.ispace.augment(schedule.dmapper) ispace = ispace.augment(schedule.rmapper) - processed.append(c.rebuild(exprs=cexprs, ispace=ispace)) + rebuilt.append(c.rebuild(exprs=cexprs, ispace=ispace)) assert len(exprs) == 0 + # [Clusters]_m -> [Clusters]_m (optimization) + if not self.opt_block_temps: + rebuilt = expose_tuning_knobs(rebuilt, self.sregistry) + + processed.extend(rebuilt) + return processed def process(self, clusters): @@ -384,10 +395,6 @@ def __init__(self, sregistry, options, platform): def process(self, clusters): processed = [] for c in clusters: - if not c.is_dense: - processed.append(c) - continue - # Rule out Dimension-independent dependencies, e.g.: # r0 = ... # u[x, y] = ... r0*a[x, y] ... @@ -405,6 +412,9 @@ def process(self, clusters): return processed def _generate(self, cgroup, exclude): + if not cgroup.is_dense: + return + exprs = cgroup.exprs # E.g., extract `u.dx*a*b` and `u.dx*a*c` from @@ -665,97 +675,91 @@ def collect(extracted, meta, minstorage): return aliases -def lower_aliases(aliases, meta, maxpar): +def lower_aliases(aliases, meta, opt_maxpar, opt_block_temps): """ Create a Schedule from an AliasList. """ - dmapper = {} + dmapper = make_blockdim_subiter(meta.ispace.itdims) + processed = [] for a in aliases: - imapper = {**{i.dim: i for i in a.intervals}, - **{i.dim.parent: i for i in a.intervals if i.dim.is_NonlinearDerived}} + imapper = { + **{i.dim: i for i in a.intervals}, + **{i.dim.parent: i for i in a.intervals if i.dim.is_NonlinearDerived} + } - intervals = [] writeto = [] + intervals = {} sub_iterators = {} indicess = [[] for _ in a.distances] for i in meta.ispace: + d = i.dim + try: - interval = imapper[i.dim] + interval = imapper[d] except KeyError: - if i.dim in a.free_symbols: + if d in a.free_symbols: # Special case: the Dimension appears within the alias but # not as an Indexed index. Then, it needs to be added to # the `writeto` region too interval = i else: # E.g., `x0_blk0` or (`a[y_m+1]` => `y not in imapper`) - intervals.append(i) + intervals[d] = i continue if not (writeto or interval != interval.zero() or - (maxpar and SEQUENTIAL not in meta.properties.get(i.dim))): - # The alias doesn't require a temporary Dimension along i.dim - intervals.append(i) + (opt_maxpar and SEQUENTIAL not in meta.properties.get(d))): + # The alias doesn't require a temporary Dimension along `d` + intervals[d] = i continue - assert not i.dim.is_NonlinearDerived + assert not d.is_NonlinearDerived - # `i.dim` is necessarily part of the write-to region, so - # we have to adjust the Interval's stamp. For example, consider - # `i=x[0,0]<1>` and `interval=x[-4,4]<0>`; here we need to - # use `<1>` as stamp, which is what appears in `ispace` + # `d` is necessarily part of the write-to region, so we have to + # adjust the Interval's stamp. For example, consider `i=x[0,0]<1>` + # and `interval=x[-4,4]<0>`; here we need to use `<1>` as stamp, + # which is what appears in `ispace` interval = interval.lift(i.stamp) - writeto.append(interval) - intervals.append(interval) - - if i.dim.is_Block: - # Suitable IncrDimensions must be used to avoid OOB accesses. - # E.g., r[xs][ys][z] => both `xs` and `ys` must be initialized such - # that all accesses are within bounds. This requires traversing the - # hierarchy of BlockDimensions to set `xs` (`ys`) in a way that - # consecutive blocks access consecutive regions in `r` (e.g., - # `xs=x0_blk1-x0_blk0` with `blocklevels=2`; `xs=0` with - # `blocklevels=1`, that is it degenerates in this case) - try: - d = dmapper[i.dim] - except KeyError: - dd = i.dim.parent - assert dd.is_Block - if dd.parent.is_Block: - # A BlockDimension in between BlockDimensions - m = i.dim.symbolic_min - i.dim.parent.symbolic_min - else: - m = 0 - d = dmapper[i.dim] = IncrDimension( - f"{i.dim.name}s", i.dim, m, dd.symbolic_size, 1, dd.step - ) - sub_iterators[i.dim] = d + if opt_block_temps: + sub_iterators.update(dmapper) + writeto.append(interval) + intervals[d] = interval + elif d.is_Block: + pd = d.parent + writeto.append(interval.relaxed) + # The lower/upper bounds belong to the parent BlockDimension + intervals[d] = interval.zero() + intervals[pd] = interval.switch(pd) else: - d = i.dim + writeto.append(interval.relaxed) + intervals[d] = interval + + di = sub_iterators.get(d, d) # Given the iteration `interval`, lower distances to indices for distance, indices in zip(a.distances, indicess, strict=True): - v = distance[interval.dim] or 0 + v = distance[i.dim] or 0 try: - indices.append(d - interval.lower + v) + indices.append(di - interval.lower + v) except TypeError: - indices.append(d) + indices.append(di) # The alias write-to space writeto = IterationSpace(IntervalGroup(writeto), sub_iterators) # Avoid scalar aliases in the presence of guards, since hoisting them - # might cause scope issues (see `test_dse.py::TestAliases::test_split_cond`) + # might cause issues (see `test_dse.py::TestAliases::test_split_cond`) if not writeto and meta.guards: continue # The alias iteration space - ispace = IterationSpace(IntervalGroup(intervals, meta.ispace.relations), - meta.ispace.sub_iterators, - meta.ispace.directions) + ispace = IterationSpace( + IntervalGroup(intervals.values(), meta.ispace.relations), + meta.ispace.sub_iterators, meta.ispace.directions + ) ispace = ispace.augment(sub_iterators) # For now, the guards coincide with the original ones, if any @@ -878,8 +882,8 @@ def optimize_schedule_rotations(schedule, sregistry): aispace = aispace.augment({d: mds + [ii]}) ispace = IterationSpace.union(rispace, aispace, relations={(d, cd, d1)}) - processed.append(ScheduledAlias( - pivot, writeto, ispace, i.aliaseds, indicess, i.guards + processed.append(i._rebuild( + pivot=pivot, writeto=writeto, ispace=ispace, indicess=indicess )) # Update the rotations mapper @@ -906,10 +910,7 @@ def optimize_schedule_maxpar_basic(schedule): writeto = writeto0.lift(dims, stamp) ispace = ispace0.lift(dims, stamp) - processed.extend([ - ScheduledAlias(pivot, writeto, ispace, aliaseds, indicess, guards) - for pivot, _, _, aliaseds, indicess, guards in g - ]) + processed.extend(i._rebuild(writeto=writeto, ispace=ispace) for i in g) return schedule.rebuild(*processed) @@ -927,31 +928,34 @@ def optimize_schedule_maxpar_compact(schedule): ispace_union = IterationSpace.union(*[i.ispace for i in schedule]) processed = [] - for pivot, writeto, ispace, aliaseds, indicess, guards in schedule: - dims = writeto.itdims + for sa in schedule: + guards = sa.guards # Do we need to introduce guards to preserve the original semantics? - for d in dims: - int0 = ispace[d] - int1 = ispace_union[d] + for i in sa.indices: + for d in i._defines: + try: + int0, int1 = sa.ispace[d], ispace_union[d] + except KeyError: + continue - if int0 != int1: - guard = sympy.And(d >= int0.symbolic_min, d <= int0.symbolic_max) - guards = guards.xandg(d, guard) + if int0 != int1: + guards = guards.xandg(i, sympy.And( + i >= int0.symbolic_min, i <= int0.symbolic_max + )) + break # This is conceptually identical to what we do for the `basic` mode - writeto = writeto.lift(dims, stamp) + dims = sa.writeto.itdims + writeto = sa.writeto.lift(dims, stamp) ispace = ispace_union.lift(dims, stamp) - processed.append(ScheduledAlias( - pivot, writeto, ispace, aliaseds, indicess, guards - )) + processed.append(sa._rebuild(writeto=writeto, ispace=ispace, guards=guards)) return schedule.rebuild(*processed) -def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype, - opt_minmem): +def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype, opt_minmem): """ Turn a Schedule into a sequence of Clusters. """ @@ -960,28 +964,12 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype, clusters = [] subs = {} - for pivot, writeto, ispace, aliaseds, indicess, guards in schedule: + for sa in schedule: + pivot, writeto, ispace, aliaseds, indicess, guards = sa + name = sregistry.make_name() - # Infer the dtype for the pivot - # This prevents cases such as `floor(a*b)` with `a` and `b` floats - # that would creat a temporary `int r = b` leading to erroneous - # numerical results if writeto: - # The Dimensions defining the shape of Array - # Note: with SubDimensions, we may have the following situation: - # - # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ... - # r[zi] = ... - # - # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll - # need as much room as in `zi`'s parent to avoid going OOB Aside - # from ugly generated code, the reason we do not rather shift the - # indices is that it prevents future passes to transform the loop - # bounds (e.g., MPI's comp/comm overlap does that) - dimensions = [d.parent if d.is_AbstractSub else d - for d in writeto.itdims] - # The minimum halo required along each Dimension depends on `writeto`. # The user might suggest to go more relaxed about this via `opt_minmem`, # in which case we extend the halo based on the surrounding @@ -1002,17 +990,20 @@ def lower_schedule(schedule, meta, sregistry, opt_ftemps, opt_min_dtype, shift = [halo[d].left - min_halo[d].left for d in writeto.itdims] halo = tuple(halo.values()) - # The indices used to write into the Array + # The dimensions of the Array, which together with `halo`, will define + # its shape + dimensions = sa.dimensions + + # The indices used to populate the Array indices = [] - for i, s in zip(writeto, shift, strict=True): + for interval, i, s in zip(writeto, sa.indices, shift, strict=True): try: # E.g., `xs` - sub_iterators = writeto.sub_iterators[i.dim] - assert len(sub_iterators) <= 1 - indices.append(sub_iterators[0] + s) - except (KeyError, IndexError): + di, = writeto.sub_iterators[i] + indices.append(di + s) + except (KeyError, ValueError): # E.g., `z` -- a non-shifted Dimension - indices.append(i.dim - i.lower + s) + indices.append(i - interval.lower + s) dtype = sympy_dtype(pivot, base=meta.dtype) obj = make(name=name, dimensions=dimensions, halo=halo, dtype=dtype, @@ -1070,21 +1061,34 @@ def optimize_clusters_msds(clusters): processed = [] for c in clusters: msds = [d for d in c.ispace.itdims if isinstance(d, MultiSubDimension)] + if not msds: + processed.append(c) + continue - if msds: - mapper = {d: d.root for d in msds} - exprs = [uxreplace(e, mapper) for e in c.exprs] + mapper = {d: d.root for d in msds} - ispace = c.ispace.relaxed(msds) + processed.append(c.subs(mapper)) - guards = {mapper.get(d, d): v for d, v in c.guards.items()} - properties = {mapper.get(d, d): v for d, v in c.properties.items()} - syncs = {mapper.get(d, d): v for d, v in c.syncs.items()} + return processed - processed.append(c.rebuild(exprs=exprs, ispace=ispace, guards=guards, - properties=properties, syncs=syncs)) - else: - processed.append(c) + +def expose_tuning_knobs(clusters, sregistry): + """ + Replace all pre-existing BlockDimensions with fresh ones, to enable + separate tuning for the CIRE-generated temporaries. + """ + # Create the new BlockDimensions + callback = lambda i: sregistry.make_name(prefix=i) + + mapper = {} + for d in set().union(*[c.used_dimensions for c in clusters]): + if d.is_Block: + mapper.update(d._rebuild_hierarchy(callback)) + + if not mapper: + return clusters + + processed = [c.subs(mapper) for c in clusters] return processed @@ -1474,18 +1478,64 @@ def is_frame(self): return all(i.is_frame for i in self._list) -ScheduledAlias = namedtuple('SchedAlias', - 'pivot writeto ispace aliaseds indicess guards') -ScheduledAlias.__new__.__defaults__ = (None, None, None, None, None, None) +class ScheduledAlias(tuple, Reconstructable): + + __rkwargs__ = ('pivot', 'writeto', 'ispace', 'aliaseds', 'indicess', 'guards') + + def __new__(cls, pivot=None, writeto=None, ispace=None, aliaseds=None, + indicess=None, guards=None): + obj = super().__new__(cls, (pivot, writeto, ispace, aliaseds, indicess, + guards)) + + assert len(aliaseds) > 0 + assert len(aliaseds) == len(indicess) + + obj.pivot = pivot + obj.writeto = writeto + obj.ispace = ispace + obj.aliaseds = aliaseds + obj.indicess = indicess + obj.guards = guards + + return obj + + def __repr__(self): + return f"ScheduledAlias<<{self.pivot}>>" + + __str__ = __repr__ + + @property + def dimensions(self): + # Legacy: with SubDimensions, we may have the following situation: + # + # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ... + # r[zi] = ... + # + # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll + # need as much room as in `zi`'s parent to avoid going out-of-bounds. + # Aside from ugly generated code, the reason we do not rather shift the + # indices is that it prevents future passes to transform the loop + # bounds (e.g., MPI's comp/comm overlap does that) + return tuple(d.parent if d.is_AbstractSub else d + for d in self.writeto.itdims) + + @property + def indices(self): + bdims = [d for d in self.ispace.itdims if d.is_Block] + depth = max([d._depth for d in bdims], default=0) + mapper = {d.root: d for d in bdims if d._depth == depth} + return tuple(mapper.get(d.root, d) for d in self.writeto.itdims) class Schedule(tuple): def __new__(cls, *items, dmapper=None, rmapper=None, is_frame=False): obj = super().__new__(cls, items) + obj.dmapper = dmapper or {} obj.rmapper = rmapper or {} obj.is_frame = is_frame + return obj def rebuild(self, *items, dmapper=None, rmapper=None, is_frame=False): @@ -1634,3 +1684,37 @@ def _(expr): @_deindexify.register(Indexed) def _(expr): return expr.function, {} + + +def make_blockdim_subiter(dimensions): + """ + Create sub-iterators for the BlockDimension in `dimensions`. + + For example, in `r[xs][ys][z]` both `xs` and `ys` must be initialized such + that all accesses are within bounds. This requires traversing the hierarchy + of BlockDimensions to set `xs` (`ys`) in a way that consecutive blocks access + consecutive regions in `r` (e.g., trivially `xs=0` with `blocklevels=1`; + `xs=x0_blk1-x0_blk0` with `blocklevels=2`; and so on). + """ + depth = max([d._depth for d in dimensions if d.is_Block], default=0) + if depth <= 1: + return {} + + mapper = {} + for d in dimensions: + if not d.is_Block or d._depth < depth: + continue + + pd = d.parent + + if pd.parent.is_Block: + # A BlockDimension in between BlockDimensions + m = d.symbolic_min - pd.symbolic_min + else: + m = 0 + + mapper[d] = IncrDimension( + f"{d.name}s", d, m, pd.symbolic_size, 1, pd.step + ) + + return mapper diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index e9076bfa85..af659193fc 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -1,9 +1,11 @@ +from itertools import groupby + from sympy import sympify from devito.finite_differences.differentiable import IndexSum from devito.ir.clusters import Queue from devito.ir.support import ( - AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT, SKEWABLE, TILABLES, Interval, + AFFINE, PARALLEL, PARALLEL_IF_ATOMIC, PARALLEL_IF_PVT, SKEWABLE, TILABLE, Interval, IntervalGroup, IterationSpace, Scope ) from devito.passes import is_on_device @@ -14,7 +16,7 @@ ) from devito.types import BlockDimension -__all__ = ['blocking'] +__all__ = ['blocking', 'apply_par_tiles'] def blocking(clusters, sregistry, options): @@ -205,12 +207,11 @@ def callback(self, clusters, prefix): # along more than three dimensions return clusters - if any(self._has_short_trip_count(i) for i in c.ispace.itdims): - properties = c.properties.block(d, 'small') - elif self._has_data_reuse(c): - properties = c.properties.block(d) - else: - properties = c.properties.block(d, 'small') + properties = c.properties.block(d) + + if any(self._has_short_trip_count(i) for i in c.ispace.itdims) or \ + not self._has_data_reuse(c): + properties = properties.notune(d) elif self._has_data_reuse(c): properties = c.properties.block(d) @@ -239,7 +240,7 @@ def process(self, clusters): if c.properties.nblockable > 1: processed.append(c) else: - properties = c.properties.drop(properties=TILABLES) + properties = c.properties.drop(properties=TILABLE) processed.append(c.rebuild(properties=properties)) return processed @@ -316,13 +317,13 @@ def callback(self, clusters, prefix): class SynthesizeBlocking(Queue): + #TODO: DROP? template = "%s%d_blk%s" def __init__(self, sregistry, options): self.sregistry = sregistry self.levels = options['blocklevels'] - self.par_tile = options['par-tile'] # Track the BlockDimensions created so far so that we can reuse them # in case of Clusters that are different but share the same number of @@ -332,10 +333,7 @@ def __init__(self, sregistry, options): super().__init__() def process(self, clusters): - # A tool to unroll the explicit integer block shapes, should there be any - blk_size_gen = BlockSizeGenerator(self.par_tile) if self.par_tile else None - - return self._process_fdta(clusters, 1, blk_size_gen=blk_size_gen) + return self._process_fdta(clusters, 1) def _make_key_guards(self, cluster, ispace): # Essentially, this will ensure that loop nests within guards *not* @@ -344,25 +342,18 @@ def _make_key_guards(self, cluster, ispace): return tuple(cluster.guards.get(i.dim) for i in ispace if not cluster.properties.is_blockable(i.dim)) - def _derive_block_dims(self, clusters, prefix, d, blk_size_gen): - if blk_size_gen is not None: - step = sympify(blk_size_gen.next(prefix, d, clusters)) - else: - # This will result in a parametric step, e.g. `x0_blk0_size` - step = None - + def _derive_block_dims(self, clusters, prefix, d): # Can I reuse existing BlockDimensions to avoid a proliferation of steps? k = stencil_footprint(clusters, d) - if step is None: - try: - return self.mapper[k] - except KeyError: - pass + try: + return self.mapper[k] + except KeyError: + pass base = self.sregistry.make_name(prefix=d.root.name) name = self.sregistry.make_name(prefix=f"{base}_blk") - bd = BlockDimension(name, d, d.symbolic_min, d.symbolic_max, step) + bd = BlockDimension(name, d, d.symbolic_min, d.symbolic_max) step = bd.step block_dims = [bd] @@ -378,7 +369,7 @@ def _derive_block_dims(self, clusters, prefix, d, blk_size_gen): return retval - def callback(self, clusters, prefix, blk_size_gen=None): + def callback(self, clusters, prefix): if not prefix: return clusters @@ -388,9 +379,7 @@ def callback(self, clusters, prefix, blk_size_gen=None): return clusters try: - block_dims, bd = self._derive_block_dims( - clusters, prefix, d, blk_size_gen - ) + block_dims, bd = self._derive_block_dims(clusters, prefix, d) except StopIteration: return clusters @@ -408,7 +397,7 @@ def callback(self, clusters, prefix, blk_size_gen=None): # The new Cluster properties -- TILABLE is dropped after blocking properties = c.properties.drop(d) - properties = properties.add(block_dims, c.properties[d] - TILABLES) + properties = properties.add(block_dims, c.properties[d] - {TILABLE}) processed.append(c.rebuild(exprs=exprs, ispace=ispace, guards=guards, properties=properties)) @@ -508,7 +497,6 @@ class BlockSizeGenerator: """ def __init__(self, par_tile): - self.tip = -1 self.umt = par_tile if par_tile.is_multi: @@ -537,56 +525,25 @@ def __init__(self, par_tile): else: self.umt_reduce = UnboundTuple(*par_tile.default, 1) - def next(self, prefix, d, clusters): - # If a whole new set of Dimensions, move the tip -- this means `clusters` - # at `d` represents a new loop nest or kernel - x = any(i.dim.is_Block for i in flatten(c.ispace for c in clusters)) - if not x: - self.tip += 1 - - # Correctness -- enforce blocking where necessary. - # See also issue #276:PRO - if any(c.properties.is_parallel_atomic(d) for c in clusters): + def schedule(self, dims, clusters): + if any(c.properties.is_parallel_atomic(dims) for c in clusters): + # Correctness -- enforce blocking where necessary. + # See also issue #276:PRO if any(c.is_sparse for c in clusters): - if not x: - self.umt_sparse.iter() - return self.umt_sparse.next() - else: - if not x: - self.umt_reduce.iter() - return self.umt_reduce.next() - - # Performance heuristics -- use a smaller par-tile - if all(c.properties.is_blockable_small(d) for c in clusters): - if not x: - self.umt_small.iter() - return self.umt_small.next() - - # We can't `self.umt.iter()` because we might still want to - # fallback to `self.umt_small` - item = self.umt.curitem() if x else self.umt.nextitem() - - # Handle user-provided rules - # TODO: This is also rudimentary - if item.rule is None: - umt = self.umt - elif is_integer(item.rule): - if item.rule == self.tip: - umt = self.umt + umt = self.umt_sparse else: - umt = self.umt_small - if not x: - umt.iter() + umt = self.umt_reduce + + elif all(c.properties.avoid_tuning(dims) for c in clusters): + # Performance heuristics -- use a smaller par-tile + umt = self.umt_small + else: - if item.rule in {d.name for d in prefix.itdims}: - umt = self.umt - else: - # This is like "pattern unmatched" -- fallback to `umt_small` - umt = self.umt_small - if not x: - umt.iter() + umt = self.umt + + umt.iter() - return umt.next() + return umt class SynthesizeSkewing(Queue): @@ -661,3 +618,41 @@ def callback(self, clusters, prefix): processed.append(c.rebuild(exprs=exprs, ispace=ispace)) return processed + + +def apply_par_tiles(clusters, options, **kwargs): + """ + Use the par-tile parameter to replace the symbolic BlockDimension sizes + with actual integer numbers representing the block shape. + """ + if not options['par-tile']: + return clusters + + blk_size_gen = BlockSizeGenerator(options['par-tile']) + + key0 = lambda d: d.is_Block and d._depth == 2 + key = lambda c: c.ispace.project(key0).itdims + + processed = [] + for bdims, group in groupby(clusters, key=key): + g = list(group) + + if not bdims: + processed.extend(g) + continue + + umt = blk_size_gen.schedule(bdims, g) + + subs = {} + compact = [] + for d in reversed(bdims): + try: + step = sympify(umt.next()) + subs.update(d._rebuild_hierarchy(step=step)) + except StopIteration: + # No more par-tile sizes available, so we drop the BlockDimensions + compact.append(d) + + processed.extend(c.subs(subs, compact) for c in g) + + return processed diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 9b936fba76..0df11d8afb 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -624,20 +624,10 @@ def _(i, mapper, sregistry): if i._depth != 2: return - p = i.parent - pp = i.parent.parent + callback = lambda i: sregistry.make_name(prefix=i) + subs = i._rebuild_hierarchy(callback) - name0 = pp.name - base = sregistry.make_name(prefix=name0) - name1 = sregistry.make_name(prefix=f'{base}_blk') - - bd = i.parent._rebuild(name1, pp) - d = i._rebuild(name0, bd, i._min.subs(p, bd), i._max.subs(p, bd)) - - mapper.update({ - i: d, - i.parent: bd - }) + mapper.update(subs) @abstract_object.register(IncrDimension) diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 41c1f90b88..665dda08d5 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -510,6 +510,16 @@ def __init_finalize__(self, name, parent): self.is_Time = parent.is_Time self.is_Space = parent.is_Space + def _rebuild_hierarchy(self, callback): + """ + Recursively rebuild the hierarchy of DerivedDimensions rooted at `self`. + + To be overridden by subclasses, since different DerivedDimensions may + have different requirements, and it'd be cumbersome to generalise it + all here. + """ + raise NotImplementedError + @property def parent(self): return self._parent @@ -1319,6 +1329,34 @@ class BlockDimension(AbstractIncrDimension): is_Block = True is_PerfKnob = True + def _rebuild_hierarchy(self, callback=None, step=None): + if self._depth != 2: + raise NotImplementedError + + p = self.parent + pp = self.parent.parent + + name0 = pp.name + + if callback is None: + name1 = p.name + else: + base = callback(name0) + name1 = callback(f'{base}_blk') + + bd = p._rebuild(name1, pp, step=step or p.step) + + subs = {p: bd} + if step is not None: + subs[p.step] = step + + d = self._rebuild( + name0, bd, + self._min.subs(subs), self._max.subs(subs), size=self.size.subs(subs) + ) + + return {self: d, p: bd} + @cached_property def _arg_names(self): try: diff --git a/tests/test_dle.py b/tests/test_dle.py index 2fd090b310..e3edbcad46 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -376,62 +376,6 @@ def test_structure_2p5D(self): assert iters[0].step == par_tile[1] assert iters[1].step == par_tile[0] - def test_custom_rule0(self): - grid = Grid(shape=(8, 8, 8)) - - u = TimeFunction(name="u", grid=grid, space_order=4) - v = TimeFunction(name="v", grid=grid, space_order=4) - - eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx), - Eq(v.forward, u.forward.dx)] - - # "Apply par-tile=(4, 4, 4) to the loop nest (kernel) with id (rule)=1, - # and use default for the rest!" - par_tile = (4, 4, 4) - rule = 1 - - op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule), - 'blockinner': True})) - - # Check generated code. By having specified "1" as rule, we expect the - # given par-tile to be applied to the kernel with id 1 - bns, _ = assert_blocking(op, {'z0_blk0', 'x0_blk0', 'z2_blk0'}) - root = bns['x0_blk0'] - iters = FindNodes(Iteration).visit(root) - iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] - assert len(iters) == 3 - assert all(i.step == j for i, j in zip(iters, par_tile, strict=True)) - - def test_custom_rule1(self): - grid = Grid(shape=(8, 8, 8)) - x, y, z = grid.dimensions - - f = Function(name='f', grid=grid) - u = TimeFunction(name="u", grid=grid, space_order=4) - v = TimeFunction(name="v", grid=grid, space_order=4) - - eqns = [Eq(u.forward, u.dz.dy + u.dx.dz + u.dy.dx + cos(f)*cos(f[x+1, y, z])), - Eq(v.forward, u.forward.dx)] - - # "Apply par-tile=(4, 4, 4) to the loop nests (kernels) embedded within - # the time loop, and use default for the rest!" - par_tile = (4, 4, 4) - rule = grid.time_dim.name # We must be able to infer it from str - - op = Operator(eqns, opt=('advanced-fsg', {'par-tile': (par_tile, rule), - 'blockinner': True, - 'blockrelax': True})) - - # Check generated code. By having specified "time" as rule, we expect the - # given par-tile to be applied to the kernel within the time loop - bns, _ = assert_blocking(op, {'x0_blk0', 'x1_blk0', 'x2_blk0'}) - for i in ['x0_blk0', 'x1_blk0', 'x2_blk0']: - root = bns[i] - iters = FindNodes(Iteration).visit(root) - iters = [i for i in iters if i.dim.is_Block and i.dim._depth == 1] - assert len(iters) == 3 - assert all(i.step == j for i, j in zip(iters, par_tile, strict=True)) - @pytest.mark.parametrize("shape", [(10,), (10, 45), (20, 33), (10, 31, 45), (45, 31, 45)]) @pytest.mark.parametrize("time_order", [2]) diff --git a/tests/test_dse.py b/tests/test_dse.py index 9c3335c8af..dca6754b6c 100644 --- a/tests/test_dse.py +++ b/tests/test_dse.py @@ -2868,6 +2868,28 @@ def test_scalar_with_cond_tinvariant(self): assert str(body0(op).body[0]) == 'const float r0 = 1.0F/dt;' assert not body0(op).body[0].ispace + def test_block_temps_false(self): + grid = Grid(shape=(34, 45, 50)) + + a = Function(name='a', grid=grid, space_order=8) + u = TimeFunction(name='u', grid=grid, space_order=8) + v = u.func(name='v') + + eqn = Eq(v.forward, a.dy.dz*u.dx.dy + v) + + op = Operator(eqn, opt=('advanced', {'openmp': False, + 'cire-block-temps': False, + 'cire-maxpar': 'basic'})) + + # Ensure it executes smoothly + op.apply(time_M=3) + + assert_structure( + op, + ['t,x0_blk0,y0_blk0,x,y,z', 't,x1_blk0,y1_blk0,x,y,z'], + 'tx0_blk0y0_blk0xyzx1_blk0y1_blk0xyz' + ) + class TestIsoAcoustic: From ff9b90636a22763b39227e7135a04b4387ab9a00 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 15 Jun 2026 15:43:10 +0100 Subject: [PATCH 7/8] compiler: Drop unused SynthesizeBlocking.template --- devito/passes/clusters/aliases.py | 5 ++--- devito/passes/clusters/blocking.py | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 0b81e5db9b..47ddd7cf89 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -5,7 +5,6 @@ import numpy as np import sympy -from devito.arch import Device from devito.exceptions import CompilationError from devito.finite_differences import EvalDerivative, IndexDerivative, Weights from devito.ir import ( @@ -23,8 +22,8 @@ is_integer, split, timed_pass ) from devito.types import ( - BlockDimension, CustomDimension, Eq, Hyperplane, IncrDimension, Indexed, - ModuloDimension, Size, StencilDimension, Symbol, Temp, TempArray, TempFunction + CustomDimension, Eq, Hyperplane, IncrDimension, Indexed, ModuloDimension, Size, + StencilDimension, Symbol, Temp, TempArray, TempFunction ) from devito.types.grid import MultiSubDimension diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index af659193fc..29f9791019 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -317,9 +317,6 @@ def callback(self, clusters, prefix): class SynthesizeBlocking(Queue): - #TODO: DROP? - template = "%s%d_blk%s" - def __init__(self, sregistry, options): self.sregistry = sregistry From f3c20e8d6b34edfb8dbefe131b88201fb431c3a8 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 16 Jun 2026 10:24:47 +0100 Subject: [PATCH 8/8] compiler: Comply with ruff --- devito/ir/support/space.py | 5 +---- devito/passes/clusters/aliases.py | 2 +- devito/passes/clusters/blocking.py | 2 +- tests/test_dimension.py | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/devito/ir/support/space.py b/devito/ir/support/space.py index 96f3e3be07..1efafe5a98 100644 --- a/devito/ir/support/space.py +++ b/devito/ir/support/space.py @@ -910,10 +910,7 @@ def switch(self, d0, d1=None, direction=None): `d1`. Optionally, `d0` could be a mapper, in which case multiple Dimensions may be switched. """ - if isinstance(d0, dict): - mapper = d0 - else: - mapper = {d0: d1} + mapper = d0 if isinstance(d0, dict) else {d0: d1} intervals = self.intervals sub_iterators = dict(self.sub_iterators) diff --git a/devito/passes/clusters/aliases.py b/devito/passes/clusters/aliases.py index 47ddd7cf89..aa5e955b6d 100644 --- a/devito/passes/clusters/aliases.py +++ b/devito/passes/clusters/aliases.py @@ -1706,7 +1706,7 @@ def make_blockdim_subiter(dimensions): pd = d.parent - if pd.parent.is_Block: + if pd.parent.is_Block: # noqa SIM108 # A BlockDimension in between BlockDimensions m = d.symbolic_min - pd.symbolic_min else: diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py index 29f9791019..3a5636a91a 100644 --- a/devito/passes/clusters/blocking.py +++ b/devito/passes/clusters/blocking.py @@ -16,7 +16,7 @@ ) from devito.types import BlockDimension -__all__ = ['blocking', 'apply_par_tiles'] +__all__ = ['apply_par_tiles', 'blocking'] def blocking(clusters, sregistry, options): diff --git a/tests/test_dimension.py b/tests/test_dimension.py index 665a31af77..c8de790431 100644 --- a/tests/test_dimension.py +++ b/tests/test_dimension.py @@ -2072,7 +2072,7 @@ def test_blocking_w_guard(self): op = Operator(eqns, opt=('advanced', {'openmp': True})) - op.cfunction + _ = op.cfunction assert_structure(op, ['t,x0_blk0,y0_blk0,x,y,z', 't,x0_blk0,y0_blk0,x,y',