Skip to content

Commit b90a129

Browse files
authored
Merge pull request coredac#298 from coredac/fix/assertion-and-performance
Fix Assertion Crash and Optimize Compile Time
2 parents bc71a57 + 037287f commit b90a129

6 files changed

Lines changed: 84 additions & 116 deletions

File tree

lib/NeuraDialect/Architecture/Architecture.cpp

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,15 @@ void Architecture::applyTileOverrides(
314314
for (const auto &override : tile_overrides) {
315315
Tile *tile = nullptr;
316316
if (override.tile_x >= 0 && override.tile_y >= 0) {
317-
tile = getTile(override.tile_x, override.tile_y);
317+
// cloneWithNewDimensions() copies tile_overrides from the original
318+
// (larger) architecture and replays them on a smaller cloned grid.
319+
// An override may reference coordinates that exist in the original but
320+
// fall outside the cloned grid's bounds. Skips such overrides rather
321+
// than asserting, so the clone only applies overrides that are relevant
322+
// to its own coordinate space.
323+
auto it = coord_to_tile_.find({override.tile_x, override.tile_y});
324+
if (it != coord_to_tile_.end())
325+
tile = it->second;
318326
}
319327

320328
if (tile) {
@@ -518,8 +526,19 @@ void Architecture::applyLinkOverrides(
518526
}
519527
// Handles link removal.
520528
else {
521-
Tile *src_tile = getTile(override.src_tile_x, override.src_tile_y);
522-
Tile *dst_tile = getTile(override.dst_tile_x, override.dst_tile_y);
529+
// cloneWithNewDimensions() replays link_overrides from the original
530+
// architecture on a smaller cloned grid. Either endpoint of a link
531+
// override may reference a tile that was not included in the clone.
532+
// Skip the override in that case to avoid a crash; the link simply
533+
// does not exist in the cloned architecture and needs no override.
534+
auto src_it = coord_to_tile_.find(
535+
{override.src_tile_x, override.src_tile_y});
536+
auto dst_it = coord_to_tile_.find(
537+
{override.dst_tile_x, override.dst_tile_y});
538+
if (src_it == coord_to_tile_.end() || dst_it == coord_to_tile_.end())
539+
continue; // One or both tiles do not exist in this architecture.
540+
Tile *src_tile = src_it->second;
541+
Tile *dst_tile = dst_it->second;
523542

524543
if (src_tile && dst_tile) {
525544
bool link_already_exists = linkExists(src_tile, dst_tile);
@@ -653,11 +672,12 @@ void Architecture::removeTile(int tile_id) {
653672

654673
Link *Architecture::getLink(int src_tile_x, int src_tile_y, int dst_tile_x,
655674
int dst_tile_y) {
656-
Tile *src_tile = getTile(src_tile_x, src_tile_y);
657-
Tile *dst_tile = getTile(dst_tile_x, dst_tile_y);
658-
if (!src_tile || !dst_tile) {
675+
auto src_it = coord_to_tile_.find({src_tile_x, src_tile_y});
676+
auto dst_it = coord_to_tile_.find({dst_tile_x, dst_tile_y});
677+
if (src_it == coord_to_tile_.end() || dst_it == coord_to_tile_.end())
659678
return nullptr; // One of the tiles does not exist.
660-
}
679+
Tile *src_tile = src_it->second;
680+
Tile *dst_tile = dst_it->second;
661681

662682
for (const auto &[id, link] : link_storage_) {
663683
if (link && link->getSrcTile() == src_tile &&
@@ -718,12 +738,11 @@ void Architecture::removeLink(Tile *src_tile, Tile *dst_tile) {
718738

719739
void Architecture::removeLink(int src_tile_x, int src_tile_y, int dst_tile_x,
720740
int dst_tile_y) {
721-
Tile *src_tile = getTile(src_tile_x, src_tile_y);
722-
Tile *dst_tile = getTile(dst_tile_x, dst_tile_y);
723-
if (!src_tile || !dst_tile) {
741+
auto src_it = coord_to_tile_.find({src_tile_x, src_tile_y});
742+
auto dst_it = coord_to_tile_.find({dst_tile_x, dst_tile_y});
743+
if (src_it == coord_to_tile_.end() || dst_it == coord_to_tile_.end())
724744
return; // One of the tiles does not exist.
725-
}
726-
removeLink(src_tile, dst_tile);
745+
removeLink(src_it->second, dst_it->second);
727746
}
728747

729748
bool Architecture::canSupportCounter() const {

lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp

Lines changed: 36 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -347,30 +347,12 @@ class TaskDependencyGraph {
347347
return total;
348348
}
349349

350-
// Public wrapper for profileTask: used by UtilizationFuser to re-profile
351-
// fused tasks with the real downstream Neura pipeline.
352-
// When skip_mapper=true, only ResMII/RecMII analytical estimates are used
353-
// (no MapToAcceleratorPass). This is safe for speculative balance checks
354-
// where the mapper may backtrack indefinitely on larger tile arrays.
355-
void profileTaskPublic(TaskGraphNode *node, TaskflowTaskOp task,
356-
bool skip_mapper = false) {
357-
profileTask(node, task, skip_mapper);
358-
}
359-
360-
private:
361-
llvm::DenseSet<std::pair<TaskGraphNode *, TaskGraphNode *>> edge_set;
362-
363-
void addEdge(TaskGraphNode *from, TaskGraphNode *to) {
364-
auto key = std::make_pair(from, to);
365-
if (edge_set.insert(key).second) {
366-
from->successors.push_back(to);
367-
to->predecessors.push_back(from);
368-
}
369-
}
370-
371350
// Profiles a single TaskflowTaskOp: clones the task, wraps the kernel in a
372351
// standalone func, and runs InsertDataMov + MapToAcceleratorPass to obtain
373352
// ii. skip_mapper: use only ResMII/RecMII analytical estimates.
353+
// When skip_mapper=true, only ResMII/RecMII analytical estimates are used
354+
// (no MapToAcceleratorPass). This is safe for speculative balance checks
355+
// where the mapper may backtrack indefinitely on larger tile arrays.
374356
void profileTask(TaskGraphNode *node, TaskflowTaskOp task,
375357
bool skip_mapper = false) {
376358
MLIRContext *ctx = task.getContext();
@@ -456,6 +438,17 @@ class TaskDependencyGraph {
456438
tmp_mod.erase();
457439
}
458440

441+
private:
442+
llvm::DenseSet<std::pair<TaskGraphNode *, TaskGraphNode *>> edge_set;
443+
444+
void addEdge(TaskGraphNode *from, TaskGraphNode *to) {
445+
auto key = std::make_pair(from, to);
446+
if (edge_set.insert(key).second) {
447+
from->successors.push_back(to);
448+
to->predecessors.push_back(from);
449+
}
450+
}
451+
459452
// Wraps a neura.kernel into a standalone func in dst_module, runs
460453
// InsertDataMov + mapper, and returns compiled_ii / cp_depth.
461454
// x_tiles/y_tiles: multi-CGRA tile grid dimensions.
@@ -1725,7 +1718,7 @@ struct ResourceAwareTaskOptimizationPass
17251718
// mapper is skipped entirely (only ResMII/RecMII estimates are used).
17261719
auto profile_fn = [&graph, use_analytical](TaskGraphNode *node,
17271720
TaskflowTaskOp task) {
1728-
graph.profileTaskPublic(node, task, /*skip_mapper=*/use_analytical);
1721+
graph.profileTask(node, task, /*skip_mapper=*/use_analytical);
17291722
};
17301723
bool fuse_changed = fuser.fuse(func, graph, profile_fn);
17311724

@@ -1739,11 +1732,13 @@ struct ResourceAwareTaskOptimizationPass
17391732
}
17401733

17411734
// Phase 2: Latency-Aware Pipeline Balance.
1742-
// Balance probes use analytical-only profiling by default.
1743-
bool balance_skip = use_analytical || balanceSkipMapper.getValue();
1744-
auto balance_profile_fn = [&graph, balance_skip](TaskGraphNode *node,
1735+
// Balance probes always use analytical-only profiling (skip_mapper=true)
1736+
// to avoid exponential backtracking blowup during speculative probing.
1737+
// The balance-skip-mapper flag now only controls whether a final
1738+
// verification mapper run is performed after convergence (see below).
1739+
auto balance_profile_fn = [&graph](TaskGraphNode *node,
17451740
TaskflowTaskOp task) {
1746-
graph.profileTaskPublic(node, task, /*skip_mapper=*/balance_skip);
1741+
graph.profileTask(node, task, /*skip_mapper=*/true);
17471742
};
17481743
PipelineBalancer balancer;
17491744
bool balance_changed = balancer.balance(graph, balance_profile_fn);
@@ -1777,19 +1772,22 @@ struct ResourceAwareTaskOptimizationPass
17771772
<< graph.getTotalAllocatedCGRAs() << "\n";
17781773

17791774
if (!balance_changed && !fuse_changed) {
1780-
// Converged — writes ALL attributes (cgra_count, ii, steps) to IR
1781-
// for every task. Non-fused tasks only got cgra_count written during
1782-
// intermediate iterations; ii, steps, and trip_count live only in the
1783-
// graph node and must be persisted here.
1784-
//
1785-
// Note: no re-profiling is done here. When balance-skip-mapper=true
1786-
// (the default), the balance phase uses analytical estimates; those
1787-
// are the values written to the final IR. When
1788-
// balance-skip-mapper=false, the balance phase already ran the real
1789-
// mapper for each speculative probe, so the graph already contains
1790-
// accurate compiled_ii / steps values. Either way, the converged
1791-
// graph state is authoritative and written directly to IR.
1775+
// Converged — optionally re-profile with the real mapper for accuracy.
1776+
// When balance-skip-mapper=false, runs the mapper once per task that
1777+
// had its cgra_count changed, to get authoritative compiled_ii/steps.
1778+
bool balance_skip_mapper = use_analytical || balanceSkipMapper.getValue();
1779+
if (!balance_skip_mapper) {
1780+
llvm::errs() << "[ResourceAware] Running final mapper verification "
1781+
<< "for converged allocation...\n";
1782+
for (auto &node : graph.nodes) {
1783+
// Re-profile with the real mapper to get accurate II/steps.
1784+
node->shape = pickBestShape(node->cgra_count);
1785+
graph.profileTask(node.get(), node->op,
1786+
/*skip_mapper=*/false);
1787+
}
1788+
}
17921789

1790+
// Writes ALL attributes (cgra_count, ii, steps) to IR for every task.
17931791
for (auto &node : graph.nodes) {
17941792
OpBuilder b(node->op);
17951793
node->shape = pickBestShape(node->cgra_count);

test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -382,19 +382,7 @@ module attributes {} {
382382
// PLACEMENT: taskflow.task @Task_2
383383
// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 0 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}], write_sram_locations = [{col = 0 : i32, row = 1 : i32}]}
384384

385-
// CGRA Tile Occupation after RESOPT (4x4 grid, col x row):
386-
// +---+---+---+---+
387-
// | 0 | 1 | . | . | Task_0_Task_1_utilfused (1x1, cgra_count=1)
388-
// +---+---+---+---+ Task_2 (1x1, cgra_count=1)
389-
// | . | . | . | . |
390-
// +---+---+---+---+
391-
// | . | . | . | . |
392-
// +---+---+---+---+
393-
// | . | . | . | . |
394-
// +---+---+---+---+
395-
// 0=Task_0_Task_1_utilfused, 1=Task_2; 2/16 CGRAs used
396-
397385
// RESOPT: taskflow.task @Task_0_Task_1_utilfused
398-
// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 3 : i32, steps = 5 : i32, tile_shape = "1x1", trip_count = 32 : i32}
386+
// RESOPT-SAME: {cgra_count = 2 : i32, compiled_ii = 3 : i32, steps = 5 : i32, tile_shape = "1x2", trip_count = 32 : i32}
399387
// RESOPT: taskflow.task @Task_2
400-
// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32, tile_shape = "1x1", trip_count = 32 : i32}
388+
// RESOPT-SAME: {cgra_count = 2 : i32, compiled_ii = 2 : i32, steps = 7 : i32, tile_shape = "1x2", trip_count = 32 : i32}

test/multi-cgra/taskflow/multi-nested/multi-nested.mlir

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -534,21 +534,10 @@ module attributes {} {
534534
// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 1 : i32}], read_sram_locations = [{col = 1 : i32, row = 1 : i32}, {col = 2 : i32, row = 1 : i32}], write_sram_locations = [{col = 1 : i32, row = 1 : i32}]}
535535

536536
// RESOPT: taskflow.task @Task_1
537-
// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 160 : i32
537+
// RESOPT: cgra_count = 2 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x2", trip_count = 160 : i32
538538
// RESOPT: taskflow.task @Task_0_Task_2_fused_Task_3_utilfused
539-
// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 5 : i32, tile_shape = "1x1", trip_count = 192 : i32
539+
// RESOPT: cgra_count = 2 : i32, compiled_ii = 2 : i32, steps = 5 : i32, tile_shape = "1x2", trip_count = 192 : i32
540540
// RESOPT: taskflow.task @Task_4
541-
// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 36 : i32
541+
// RESOPT: cgra_count = 2 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x2", trip_count = 36 : i32
542542
// RESOPT: return
543543

544-
// CGRA Tile Occupation after RESOPT (4x4 grid, col x row):
545-
// +---+---+---+---+
546-
// | 0 | 1 | 2 | . | Task_1 (1x1, cgra_count=1)
547-
// +---+---+---+---+ Task_0_Task_2_fused_Task_3_utilfused (1x1, cgra_count=1)
548-
// | . | . | . | . | Task_4 (1x1, cgra_count=1)
549-
// +---+---+---+---+
550-
// | . | . | . | . |
551-
// +---+---+---+---+
552-
// | . | . | . | . |
553-
// +---+---+---+---+
554-
// 0=Task_1, 1=Task_0_Task_2_fused_Task_3_utilfused, 2=Task_4; 3/16 CGRAs used

test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -161,17 +161,6 @@ module {
161161
// PLACEMENT-SAME: task_mapping_info = {cgra_positions = [{col = 1 : i32, row = 0 : i32}], read_sram_locations = [{col = 1 : i32, row = 0 : i32}, {col = 1 : i32, row = 0 : i32}], write_sram_locations = [{col = 1 : i32, row = 0 : i32}]}
162162

163163
// RESOPT: taskflow.task @Task_0_Task_1_utilfused
164-
// RESOPT: cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x1", trip_count = 64 : i32
164+
// RESOPT: cgra_count = 2 : i32, compiled_ii = 2 : i32, steps = 4 : i32, tile_shape = "1x2", trip_count = 64 : i32
165165
// RESOPT: return
166166

167-
// CGRA Tile Occupation after RESOPT (4x4 grid, col x row):
168-
// +---+---+---+---+
169-
// | 0 | . | . | . | row=0: Task_0_Task_1_utilfused (1x1, cgra_count=1)
170-
// +---+---+---+---+
171-
// | . | . | . | . |
172-
// +---+---+---+---+
173-
// | . | . | . | . |
174-
// +---+---+---+---+
175-
// | . | . | . | . |
176-
// +---+---+---+---+
177-
// 0=Task_0_Task_1_utilfused; 1/16 CGRAs used

test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir

Lines changed: 11 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,11 @@
3535
// RUN: --canonicalize-live-in \
3636
// RUN: --leverage-predicated-value \
3737
// RUN: --transform-ctrl-to-data-flow \
38-
// RUN: --fold-constant
39-
// RU: '--resource-aware-task-optimization=balance-skip-mapper=false' \
40-
// RU: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \
41-
// RU: -o %t.resopt.mlir
42-
// RU: FileCheck %s --input-file=%t.resopt.mlir --check-prefixes=RESOPT
38+
// RUN: --fold-constant \
39+
// RUN: '--resource-aware-task-optimization=balance-skip-mapper=false' \
40+
// RUN: --architecture-spec=%S/../../../arch_spec/architecture_with_counter.yaml \
41+
// RUN: -o %t.resopt.mlir
42+
// RUN: FileCheck %s --input-file=%t.resopt.mlir --check-prefixes=RESOPT
4343

4444

4545
module attributes {torch.debug_module_name = "SimpleResNetBlock"} {
@@ -706,30 +706,15 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} {
706706

707707

708708
// RESOPT: taskflow.task @Task_1_Task_0_Task_2_utilfused_utilfused
709-
// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 4 : i32, steps = 3 : i32, tile_shape = "1x1", trip_count = 6400 : i32}
709+
// RESOPT-SAME: {cgra_count = 2 : i32, compiled_ii = 5 : i32, steps = 3 : i32, tile_shape = "1x2", trip_count = 6400 : i32}
710710
// RESOPT: taskflow.task @Task_3
711-
// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32, tile_shape = "1x1", trip_count = 2359296 : i32}
711+
// RESOPT-SAME: {cgra_count = 2 : i32, compiled_ii = 3 : i32, steps = 6 : i32, tile_shape = "1x2", trip_count = 2359296 : i32}
712712
// RESOPT: taskflow.task @Task_4_Task_5_fused_Task_7_utilfused
713-
// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 7 : i32, tile_shape = "1x1", trip_count = 6400 : i32}
713+
// RESOPT-SAME: {cgra_count = 2 : i32, compiled_ii = 4 : i32, steps = 7 : i32, tile_shape = "1x2", trip_count = 6400 : i32}
714714
// RESOPT: taskflow.task @Task_6_Task_8_utilfused
715-
// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 3 : i32, tile_shape = "1x1", trip_count = 4096 : i32}
715+
// RESOPT-SAME: {cgra_count = 2 : i32, compiled_ii = 3 : i32, steps = 3 : i32, tile_shape = "1x2", trip_count = 4096 : i32}
716716
// RESOPT: taskflow.task @Task_9
717-
// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 6 : i32, tile_shape = "1x1", trip_count = 2359296 : i32}
717+
// RESOPT-SAME: {cgra_count = 2 : i32, compiled_ii = 3 : i32, steps = 6 : i32, tile_shape = "1x2", trip_count = 2359296 : i32}
718718
// RESOPT: taskflow.task @Task_10_Task_11_Task_12_fused_fused
719-
// RESOPT-SAME: {cgra_count = 1 : i32, compiled_ii = 2 : i32, steps = 8 : i32, tile_shape = "1x1", trip_count = 4096 : i32}
719+
// RESOPT-SAME: {cgra_count = 2 : i32, compiled_ii = 7 : i32, steps = 8 : i32, tile_shape = "1x2", trip_count = 4096 : i32}
720720
// RESOPT: return
721-
722-
723-
// CGRA Tile Occupation after RESOPT (4x4 grid, col x row):
724-
// +---+---+---+---+
725-
// | 0 | 1 | 2 | 3 | row=0: 0=Task_1_..._utilfused, 1=Task_3, 2=Task_4_..._utilfused, 3=Task_6_Task_8_utilfused
726-
// +---+---+---+---+
727-
// | 4 | 5 | . | . | row=1: 4=Task_9, 5=Task_10_..._fused_fused
728-
// +---+---+---+---+
729-
// | . | . | . | . |
730-
// +---+---+---+---+
731-
// | . | . | . | . |
732-
// +---+---+---+---+
733-
// 0=Task_1_Task_0_Task_2_utilfused_utilfused, 1=Task_3, 2=Task_4_Task_5_fused_Task_7_utilfused
734-
// 3=Task_6_Task_8_utilfused, 4=Task_9, 5=Task_10_Task_11_Task_12_fused_fused
735-
// 6/16 CGRAs used

0 commit comments

Comments
 (0)