Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
7b59f0f
Initial work on compiler profiling.
mcourteaux Mar 8, 2026
deea8f8
Refine lambda argument requirements in IRMutator and IRVisitor
alexreinking Mar 8, 2026
6bed833
Early exit in the loop-checking visitor
alexreinking Mar 8, 2026
4ea11aa
Compute last_use in-line
alexreinking Mar 8, 2026
2e9f117
Avoid redundant FindBufferUsage in For loop visitor
alexreinking Mar 8, 2026
9094ea8
fixup! Compute last_use in-line
alexreinking Mar 8, 2026
de8dc95
More profiling stuff.
mcourteaux Mar 8, 2026
def7be2
Fix build when not compiling in profiling.
mcourteaux Mar 8, 2026
dfc98d9
Disable RTTI naming when it's not enabled in the build config.
mcourteaux Mar 9, 2026
23071b2
Merge remote-tracking branch 'origin/alexreinking/inject-host-copies-…
mcourteaux Mar 9, 2026
b297402
Cleanup.
mcourteaux Mar 9, 2026
25aef21
Annotate InjectHostDevBufferCopies
mcourteaux Mar 9, 2026
cd1488a
Annotate Bounds and AddImageChecks
mcourteaux Mar 9, 2026
01cb49c
More annotating.
mcourteaux Mar 9, 2026
ec468a4
Clang-format and makefile fix, and support no RTTI.
mcourteaux Mar 9, 2026
f833bfd
Missing header in makefile.
mcourteaux Mar 9, 2026
865c601
Merge branch 'main' into compiler-profiling
mcourteaux Mar 14, 2026
2f3c0f3
Remove Profiled<...> from all mutators/visitors.
mcourteaux Mar 14, 2026
8c7aaed
Strip PerformanceCounter and use chrono instead, for simplicity.
mcourteaux Mar 14, 2026
c53b74b
Ditch profiled_xxx in favor of a simple call to operator()(...)
mcourteaux Mar 14, 2026
5738ed9
Change the main entry point of visitors and mutators to operator().
mcourteaux Mar 14, 2026
e6f9238
Clang-format
mcourteaux Mar 14, 2026
6b39180
Merge branch 'main' into compiler-profiling
mcourteaux May 13, 2026
878b903
Sort
mcourteaux May 13, 2026
f1c111b
Fixes.
mcourteaux May 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ Halide_feature(WITH_DOCS "Halide's Doxygen documentation" OFF)
Halide_feature(WITH_PACKAGING "Halide's CMake package install rules" TOP_LEVEL)
Halide_feature(WITH_PYTHON_BINDINGS "Halide's native Python module (not the whole pip package)" ON
DEPENDS Halide_ENABLE_EXCEPTIONS AND Halide_ENABLE_RTTI)
Halide_feature(WITH_COMPILER_PROFILING "Enable internal compiler tracing" OFF)
Halide_feature(WITH_SERIALIZATION "Include experimental Serialization/Deserialization code" ON)
Halide_feature(WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING
"Intercepting JIT compilation with a serialization roundtrip, for test only"
Expand Down
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,7 @@ SOURCE_FILES = \
CodeGen_WebGPU_Dev.cpp \
CodeGen_X86.cpp \
CompilerLogger.cpp \
CompilerProfiling.cpp \
ConstantBounds.cpp \
ConstantInterval.cpp \
CPlusPlusMangle.cpp \
Expand Down Expand Up @@ -681,6 +682,7 @@ HEADER_FILES = \
CodeGen_Vulkan_Dev.h \
CodeGen_WebGPU_Dev.h \
CompilerLogger.h \
CompilerProfiling.h \
ConciseCasts.h \
ConstantBounds.h \
ConstantInterval.h \
Expand Down
2 changes: 2 additions & 0 deletions src/AddImageChecks.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "AddImageChecks.h"
#include "CompilerProfiling.h"
#include "ExternFuncArgument.h"
#include "Function.h"
#include "IRMutator.h"
Expand Down Expand Up @@ -162,6 +163,7 @@ Stmt add_image_checks_inner(Stmt s,
const map<string, Function> &env,
const FuncValueBounds &fb,
bool will_inject_host_copies) {
ZoneScoped;

bool no_bounds_query = t.has_feature(Target::NoBoundsQuery);

Expand Down
1 change: 1 addition & 0 deletions src/AddParameterChecks.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "AddParameterChecks.h"
#include "CompilerProfiling.h"
#include "IROperator.h"
#include "IRVisitor.h"
#include "Substitute.h"
Expand Down
1 change: 1 addition & 0 deletions src/AsyncProducers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1016,6 +1016,7 @@ class TightenForkNodes : public IRMutator {
} // namespace

Stmt fork_async_producers(Stmt s, const map<string, Function> &env) {
ZoneScoped;
s = TightenProducerConsumerNodes(env)(s);
s = InjectRingBuffering(env)(s);
s = ForkAsyncProducers(env)(s);
Expand Down
3 changes: 3 additions & 0 deletions src/Bounds.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ Expr find_constant_bound(const Expr &e, Direction d, const Scope<Interval> &scop
}

Interval find_constant_bounds(const Expr &e, const Scope<Interval> &scope) {
ZoneScoped;
Expr expr = bound_correlated_differences(simplify(remove_likelies(e)));
Interval interval = bounds_of_expr_in_scope(expr, scope, FuncValueBounds(), true);
interval = simplify(interval);
Expand Down Expand Up @@ -3021,6 +3022,7 @@ class BoxesTouched : public IRGraphVisitor {

map<string, Box> boxes_touched(const Expr &e, Stmt s, bool consider_calls, bool consider_provides,
const string &fn, const Scope<Interval> &scope, const FuncValueBounds &fb) {
ZoneScoped;
if (!fn.empty() && s.defined()) {
// Filter things down to the relevant sub-Stmts, so we don't spend a
// long time reasoning about lets and ifs that don't surround an
Expand Down Expand Up @@ -3276,6 +3278,7 @@ Interval compute_pure_function_definition_value_bounds(

FuncValueBounds compute_function_value_bounds(const vector<string> &order,
const map<string, Function> &env) {
ZoneScoped;
FuncValueBounds fb;

for (const auto &func_name : order) {
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ target_sources(
CodeGen_Vulkan_Dev.h
CodeGen_WebGPU_Dev.h
CompilerLogger.h
CompilerProfiling.h
ConciseCasts.h
ConstantBounds.h
ConstantInterval.h
Expand Down Expand Up @@ -274,6 +275,7 @@ target_sources(
CodeGen_WebGPU_Dev.cpp
CodeGen_X86.cpp
CompilerLogger.cpp
CompilerProfiling.cpp
ConstantBounds.cpp
ConstantInterval.cpp
CPlusPlusMangle.cpp
Expand Down
3 changes: 3 additions & 0 deletions src/CSE.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <map>

#include "CSE.h"
#include "CompilerProfiling.h"
#include "IREquality.h"
#include "IRMutator.h"
#include "IROperator.h"
Expand Down Expand Up @@ -291,6 +292,7 @@ class CSEEveryExprInStmt : public IRMutator {
} // namespace

Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) {
ZoneScoped;
Expr e = e_in;

// Early-out for trivial cases.
Expand Down Expand Up @@ -338,6 +340,7 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) {
};
UniqueNameProvider namer;
{
ZoneScopedN("UniqueNameProvider");
e.accept(&namer);
}

Expand Down
1 change: 1 addition & 0 deletions src/CodeGen_C.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ class TypeInfoGatherer : public IRGraphVisitor {

CodeGen_C::CodeGen_C(ostream &s, const Target &t, OutputKind output_kind, const std::string &guard)
: IRPrinter(s), id("$$ BAD ID $$"), target(t), output_kind(output_kind) {
ZoneScoped;

if (output_kind == CPlusPlusFunctionInfoHeader) {
// If it's a header, emit an include guard.
Expand Down
10 changes: 9 additions & 1 deletion src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "CodeGen_LLVM.h"
#include "CodeGen_Targets.h"
#include "CompilerLogger.h"
#include "CompilerProfiling.h"
#include "Debug.h"
#include "Deinterleave.h"
#include "EmulateFloat16Math.h"
Expand Down Expand Up @@ -223,6 +224,7 @@ std::unique_ptr<CodeGen_LLVM> CodeGen_LLVM::new_for_target(const Target &target,
}

void CodeGen_LLVM::initialize_llvm() {
ZoneScoped;
static std::once_flag init_llvm_once;
std::call_once(init_llvm_once, []() {
// You can hack in command-line args to llvm with the
Expand Down Expand Up @@ -260,6 +262,7 @@ void CodeGen_LLVM::initialize_llvm() {
}

void CodeGen_LLVM::init_context() {
ZoneScoped;
// Ensure our IRBuilder is using the current context.
builder = std::make_unique<IRBuilder<>>(*context);

Expand Down Expand Up @@ -301,6 +304,7 @@ void CodeGen_LLVM::init_context() {
}

void CodeGen_LLVM::init_module() {
ZoneScoped;
init_context();

// Start with a module containing the initial module for this target.
Expand Down Expand Up @@ -496,6 +500,7 @@ CodeGen_LLVM::ScopedFastMath::~ScopedFastMath() {
}

std::unique_ptr<llvm::Module> CodeGen_LLVM::compile(const Module &input) {
ZoneScoped;
any_strict_float = input.any_strict_float();

init_codegen(input.name());
Expand Down Expand Up @@ -625,6 +630,7 @@ std::unique_ptr<llvm::Module> CodeGen_LLVM::compile(const Module &input) {
}

std::unique_ptr<llvm::Module> CodeGen_LLVM::finish_codegen() {
ZoneScoped;
llvm::for_each(*module, set_function_attributes_from_halide_target_options);

// Verify the module is ok
Expand Down Expand Up @@ -704,6 +710,7 @@ void CodeGen_LLVM::end_func(const std::vector<LoweredArgument> &args) {

void CodeGen_LLVM::compile_func(const LoweredFunc &f, const std::string &simple_name,
const std::string &extern_name) {
ZoneScoped;
// Generate the function declaration and argument unpacking code.
begin_func(f.linkage, simple_name, extern_name, f.args);

Expand Down Expand Up @@ -1133,6 +1140,7 @@ llvm::Type *CodeGen_LLVM::llvm_type_of(const Type &t) const {
}

void CodeGen_LLVM::optimize_module() {
ZoneScoped;
debug(3) << "Optimizing module\n";

auto time_start = std::chrono::high_resolution_clock::now();
Expand Down Expand Up @@ -3998,7 +4006,7 @@ void CodeGen_LLVM::visit(const For *op) {
// Pop the loop variable from the scope
sym_pop(op->name);
} else {
internal_error << "Unknown type of For node. Only Serial and Parallel For nodes should survive down to codegen.\n";
internal_error << "Unknown type of For node: " << op->for_type << ". Only Serial and Parallel For nodes should survive down to codegen.\n";
}
}

Expand Down
6 changes: 6 additions & 0 deletions src/CodeGen_Metal_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
public:
CodeGen_Metal_C(std::ostream &s, const Target &t)
: CodeGen_GPU_C(s, t) {
ZoneScoped;
abs_returns_unsigned_type = false;

#define alias(x, y) \
Expand Down Expand Up @@ -643,6 +644,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const FloatImm *op) {
void CodeGen_Metal_Dev::add_kernel(Stmt s,
const string &name,
const vector<DeviceArgument> &args) {
ZoneScoped;
debug(2) << "CodeGen_Metal_Dev::compile " << name << "\n";

// We need to scalarize/de-predicate any loads/stores, since Metal does not
Expand Down Expand Up @@ -676,6 +678,7 @@ struct BufferSize {
void CodeGen_Metal_Dev::CodeGen_Metal_C::add_kernel(const Stmt &s,
const string &name,
const vector<DeviceArgument> &args) {
ZoneScoped;
debug(2) << "Adding Metal kernel " << name << "\n";

// Figure out which arguments should be passed in constant.
Expand Down Expand Up @@ -825,6 +828,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::add_kernel(const Stmt &s,
}

void CodeGen_Metal_Dev::init_module() {
ZoneScoped;
debug(2) << "Metal device codegen init_module\n";

// wipe the internal kernel source
Expand Down Expand Up @@ -865,6 +869,7 @@ void CodeGen_Metal_Dev::init_module() {
}

vector<char> CodeGen_Metal_Dev::compile_to_src() {
ZoneScoped;
string str = src_stream.str();
debug(1) << "Metal kernel:\n"
<< str << "\n";
Expand Down Expand Up @@ -921,6 +926,7 @@ std::string CodeGen_Metal_Dev::print_gpu_name(const std::string &name) {
} // namespace

// Factory for the Metal device code generator: constructs a CodeGen_Metal_Dev
// for `target` and returns it owned through the CodeGen_GPU_Dev base type.
std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_Metal_Dev(const Target &target) {
// NOTE(review): ZoneScoped is defined outside this view; presumably it opens a
// compiler-profiling zone that lasts until the end of this function -- confirm.
ZoneScoped;
return std::make_unique<CodeGen_Metal_Dev>(target);
}

Expand Down
2 changes: 2 additions & 0 deletions src/CodeGen_OpenCL_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -909,6 +909,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Atomic *op) {
void CodeGen_OpenCL_Dev::add_kernel(Stmt s,
const string &name,
const vector<DeviceArgument> &args) {
ZoneScoped;
debug(2) << "CodeGen_OpenCL_Dev::compile " << name << "\n";

// We need to scalarize/de-predicate any loads/stores, since OpenCL does not
Expand Down Expand Up @@ -943,6 +944,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s,
const string &name,
const vector<DeviceArgument> &args) {

ZoneScoped;
debug(2) << "Adding OpenCL kernel " << name << "\n";

debug(2) << "Eliminating bool vectors\n";
Expand Down
2 changes: 2 additions & 0 deletions src/CodeGen_PTX_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ Type CodeGen_PTX_Dev::upgrade_type_for_storage(const Type &t) const {
void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
const std::string &name,
const std::vector<DeviceArgument> &args) {
ZoneScoped;
internal_assert(module != nullptr);

debug(2) << "In CodeGen_PTX_Dev::add_kernel\n";
Expand Down Expand Up @@ -219,6 +220,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt,
}

void CodeGen_PTX_Dev::init_module() {
ZoneScoped;
// This class uses multiple inheritance. It's a GPU device code generator,
// and also an llvm-based one. Both of these track strict_float presence,
// but OffloadGPULoops only sets the GPU device code generator flag, so here
Expand Down
Loading
Loading