diff --git a/CMakeLists.txt b/CMakeLists.txt index 684e8f988cec..ad4e8ceaf6a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ Halide_feature(WITH_DOCS "Halide's Doxygen documentation" OFF) Halide_feature(WITH_PACKAGING "Halide's CMake package install rules" TOP_LEVEL) Halide_feature(WITH_PYTHON_BINDINGS "Halide's native Python module (not the whole pip package)" ON DEPENDS Halide_ENABLE_EXCEPTIONS AND Halide_ENABLE_RTTI) +Halide_feature(WITH_COMPILER_PROFILING "Enable internal compiler tracing" OFF) Halide_feature(WITH_SERIALIZATION "Include experimental Serialization/Deserialization code" ON) Halide_feature(WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING "Intercepting JIT compilation with a serialization roundtrip, for test only" diff --git a/Makefile b/Makefile index 522330034149..22b99f6e1f50 100644 --- a/Makefile +++ b/Makefile @@ -484,6 +484,7 @@ SOURCE_FILES = \ CodeGen_WebGPU_Dev.cpp \ CodeGen_X86.cpp \ CompilerLogger.cpp \ + CompilerProfiling.cpp \ ConstantBounds.cpp \ ConstantInterval.cpp \ CPlusPlusMangle.cpp \ @@ -681,6 +682,7 @@ HEADER_FILES = \ CodeGen_Vulkan_Dev.h \ CodeGen_WebGPU_Dev.h \ CompilerLogger.h \ + CompilerProfiling.h \ ConciseCasts.h \ ConstantBounds.h \ ConstantInterval.h \ diff --git a/src/AddImageChecks.cpp b/src/AddImageChecks.cpp index b9ef94564049..0d3863bdaa0d 100644 --- a/src/AddImageChecks.cpp +++ b/src/AddImageChecks.cpp @@ -1,4 +1,5 @@ #include "AddImageChecks.h" +#include "CompilerProfiling.h" #include "ExternFuncArgument.h" #include "Function.h" #include "IRMutator.h" @@ -162,6 +163,7 @@ Stmt add_image_checks_inner(Stmt s, const map &env, const FuncValueBounds &fb, bool will_inject_host_copies) { + ZoneScoped; bool no_bounds_query = t.has_feature(Target::NoBoundsQuery); diff --git a/src/AddParameterChecks.cpp b/src/AddParameterChecks.cpp index e63c4a6395d5..18a7dfb21556 100644 --- a/src/AddParameterChecks.cpp +++ b/src/AddParameterChecks.cpp @@ -1,4 +1,5 @@ #include "AddParameterChecks.h" +#include 
"CompilerProfiling.h" #include "IROperator.h" #include "IRVisitor.h" #include "Substitute.h" diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index 4da23c58bd10..809eb9861a3d 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -1016,6 +1016,7 @@ class TightenForkNodes : public IRMutator { } // namespace Stmt fork_async_producers(Stmt s, const map &env) { + ZoneScoped; s = TightenProducerConsumerNodes(env)(s); s = InjectRingBuffering(env)(s); s = ForkAsyncProducers(env)(s); diff --git a/src/Bounds.cpp b/src/Bounds.cpp index 9087516b54b8..d8ca4330df37 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -119,6 +119,7 @@ Expr find_constant_bound(const Expr &e, Direction d, const Scope &scop } Interval find_constant_bounds(const Expr &e, const Scope &scope) { + ZoneScoped; Expr expr = bound_correlated_differences(simplify(remove_likelies(e))); Interval interval = bounds_of_expr_in_scope(expr, scope, FuncValueBounds(), true); interval = simplify(interval); @@ -3021,6 +3022,7 @@ class BoxesTouched : public IRGraphVisitor { map boxes_touched(const Expr &e, Stmt s, bool consider_calls, bool consider_provides, const string &fn, const Scope &scope, const FuncValueBounds &fb) { + ZoneScoped; if (!fn.empty() && s.defined()) { // Filter things down to the relevant sub-Stmts, so we don't spend a // long time reasoning about lets and ifs that don't surround an @@ -3276,6 +3278,7 @@ Interval compute_pure_function_definition_value_bounds( FuncValueBounds compute_function_value_bounds(const vector &order, const map &env) { + ZoneScoped; FuncValueBounds fb; for (const auto &func_name : order) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7558e589fe75..11eaed230b11 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -87,6 +87,7 @@ target_sources( CodeGen_Vulkan_Dev.h CodeGen_WebGPU_Dev.h CompilerLogger.h + CompilerProfiling.h ConciseCasts.h ConstantBounds.h ConstantInterval.h @@ -274,6 +275,7 @@ target_sources( CodeGen_WebGPU_Dev.cpp 
CodeGen_X86.cpp CompilerLogger.cpp + CompilerProfiling.cpp ConstantBounds.cpp ConstantInterval.cpp CPlusPlusMangle.cpp diff --git a/src/CSE.cpp b/src/CSE.cpp index 95f61f0214e9..d86982782dbb 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -1,6 +1,7 @@ #include #include "CSE.h" +#include "CompilerProfiling.h" #include "IREquality.h" #include "IRMutator.h" #include "IROperator.h" @@ -291,6 +292,7 @@ class CSEEveryExprInStmt : public IRMutator { } // namespace Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) { + ZoneScoped; Expr e = e_in; // Early-out for trivial cases. @@ -338,6 +340,7 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) { }; UniqueNameProvider namer; { + ZoneScopedN("UniqueNameProvider"); e.accept(&namer); } diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 7b5a9792247e..61c73233a517 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -208,6 +208,7 @@ class TypeInfoGatherer : public IRGraphVisitor { CodeGen_C::CodeGen_C(ostream &s, const Target &t, OutputKind output_kind, const std::string &guard) : IRPrinter(s), id("$$ BAD ID $$"), target(t), output_kind(output_kind) { + ZoneScoped; if (output_kind == CPlusPlusFunctionInfoHeader) { // If it's a header, emit an include guard. 
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 9263aef95f75..16b1935aca0d 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -9,6 +9,7 @@ #include "CodeGen_LLVM.h" #include "CodeGen_Targets.h" #include "CompilerLogger.h" +#include "CompilerProfiling.h" #include "Debug.h" #include "Deinterleave.h" #include "EmulateFloat16Math.h" @@ -223,6 +224,7 @@ std::unique_ptr CodeGen_LLVM::new_for_target(const Target &target, } void CodeGen_LLVM::initialize_llvm() { + ZoneScoped; static std::once_flag init_llvm_once; std::call_once(init_llvm_once, []() { // You can hack in command-line args to llvm with the @@ -260,6 +262,7 @@ void CodeGen_LLVM::initialize_llvm() { } void CodeGen_LLVM::init_context() { + ZoneScoped; // Ensure our IRBuilder is using the current context. builder = std::make_unique>(*context); @@ -301,6 +304,7 @@ void CodeGen_LLVM::init_context() { } void CodeGen_LLVM::init_module() { + ZoneScoped; init_context(); // Start with a module containing the initial module for this target. @@ -496,6 +500,7 @@ CodeGen_LLVM::ScopedFastMath::~ScopedFastMath() { } std::unique_ptr CodeGen_LLVM::compile(const Module &input) { + ZoneScoped; any_strict_float = input.any_strict_float(); init_codegen(input.name()); @@ -625,6 +630,7 @@ std::unique_ptr CodeGen_LLVM::compile(const Module &input) { } std::unique_ptr CodeGen_LLVM::finish_codegen() { + ZoneScoped; llvm::for_each(*module, set_function_attributes_from_halide_target_options); // Verify the module is ok @@ -704,6 +710,7 @@ void CodeGen_LLVM::end_func(const std::vector &args) { void CodeGen_LLVM::compile_func(const LoweredFunc &f, const std::string &simple_name, const std::string &extern_name) { + ZoneScoped; // Generate the function declaration and argument unpacking code. 
begin_func(f.linkage, simple_name, extern_name, f.args); @@ -1133,6 +1140,7 @@ llvm::Type *CodeGen_LLVM::llvm_type_of(const Type &t) const { } void CodeGen_LLVM::optimize_module() { + ZoneScoped; debug(3) << "Optimizing module\n"; auto time_start = std::chrono::high_resolution_clock::now(); @@ -3998,7 +4006,7 @@ void CodeGen_LLVM::visit(const For *op) { // Pop the loop variable from the scope sym_pop(op->name); } else { - internal_error << "Unknown type of For node. Only Serial and Parallel For nodes should survive down to codegen.\n"; + internal_error << "Unknown type of For node: " << op->for_type << ". Only Serial and Parallel For nodes should survive down to codegen.\n"; } } diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 61fbb8602c10..785a2eecfd5b 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -70,6 +70,7 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev { public: CodeGen_Metal_C(std::ostream &s, const Target &t) : CodeGen_GPU_C(s, t) { + ZoneScoped; abs_returns_unsigned_type = false; #define alias(x, y) \ @@ -643,6 +644,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const FloatImm *op) { void CodeGen_Metal_Dev::add_kernel(Stmt s, const string &name, const vector &args) { + ZoneScoped; debug(2) << "CodeGen_Metal_Dev::compile " << name << "\n"; // We need to scalarize/de-predicate any loads/stores, since Metal does not @@ -676,6 +678,7 @@ struct BufferSize { void CodeGen_Metal_Dev::CodeGen_Metal_C::add_kernel(const Stmt &s, const string &name, const vector &args) { + ZoneScoped; debug(2) << "Adding Metal kernel " << name << "\n"; // Figure out which arguments should be passed in constant. 
@@ -825,6 +828,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::add_kernel(const Stmt &s, } void CodeGen_Metal_Dev::init_module() { + ZoneScoped; debug(2) << "Metal device codegen init_module\n"; // wipe the internal kernel source @@ -865,6 +869,7 @@ void CodeGen_Metal_Dev::init_module() { } vector CodeGen_Metal_Dev::compile_to_src() { + ZoneScoped; string str = src_stream.str(); debug(1) << "Metal kernel:\n" << str << "\n"; @@ -921,6 +926,7 @@ std::string CodeGen_Metal_Dev::print_gpu_name(const std::string &name) { } // namespace std::unique_ptr new_CodeGen_Metal_Dev(const Target &target) { + ZoneScoped; return std::make_unique(target); } diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 807d75444ed4..43d4e60c8a37 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -909,6 +909,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Atomic *op) { void CodeGen_OpenCL_Dev::add_kernel(Stmt s, const string &name, const vector &args) { + ZoneScoped; debug(2) << "CodeGen_OpenCL_Dev::compile " << name << "\n"; // We need to scalarize/de-predicate any loads/stores, since OpenCL does not @@ -943,6 +944,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s, const string &name, const vector &args) { + ZoneScoped; debug(2) << "Adding OpenCL kernel " << name << "\n"; debug(2) << "Eliminating bool vectors\n"; diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 0786ade82847..087447eceb94 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -129,6 +129,7 @@ Type CodeGen_PTX_Dev::upgrade_type_for_storage(const Type &t) const { void CodeGen_PTX_Dev::add_kernel(Stmt stmt, const std::string &name, const std::vector &args) { + ZoneScoped; internal_assert(module != nullptr); debug(2) << "In CodeGen_PTX_Dev::add_kernel\n"; @@ -219,6 +220,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt, } void CodeGen_PTX_Dev::init_module() { + ZoneScoped; // This class uses multiple inheritance. 
It's a GPU device code generator,
+// and also an llvm-based one. Both of these track strict_float presence,
+// but OffloadGPULoops only sets the GPU device code generator flag, so here
diff --git a/src/CompilerProfiling.cpp b/src/CompilerProfiling.cpp
new file mode 100644
index 000000000000..06af2b7907b8
--- /dev/null
+++ b/src/CompilerProfiling.cpp
@@ -0,0 +1,193 @@
+#include "CompilerProfiling.h"
+
+#ifdef WITH_COMPILER_PROFILING
+
+#include "Util.h"
+
+#if !defined(_WIN32)
+#include <cxxabi.h>
+#else
+#include <windows.h>
+#pragma warning(disable : 4091)
+#include <dbghelp.h>
+#pragma comment(lib, "dbghelp.lib")
+#endif
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+
+namespace Halide {
+namespace Internal {
+namespace Profiling {
+
+static std::string demangle(const char *name) {
+#if !defined(_WIN32)
+    int status = 0;
+    char *p = abi::__cxa_demangle(name, nullptr, nullptr, &status);
+    std::string ret(p ? p : name);
+    free(p);
+    return ret;
+#else
+    char demangled_name[8192];
+    if (UnDecorateSymbolName(name, demangled_name, sizeof(demangled_name),
+                             UNDNAME_COMPLETE)) {
+        std::string ret(demangled_name);
+        return ret;
+    } else {
+        DWORD error = GetLastError();
+        std::cout << "UnDecorateSymbolName error: " << error << std::endl;
+        return name;
+    }
+#endif
+}
+
+static std::string cleanup_name(std::string name) {
+    debug(2) << "Cleaned " << name << " => ";
+    {
+        std::string_view profiled = "Halide::Internal::Profiling::Profiled<";
+        if (size_t idx = name.find(profiled); idx != std::string::npos) {
+            size_t cl = idx + 1;
+            int num_open = 1;
+            while (num_open != 0 && cl < name.size()) {
+                if (name[cl] == '<') {
+                    num_open++;
+                } else if (name[cl] == '>') {
+                    num_open--;
+                }
+                cl++;
+            }
+            name = name.substr(idx + profiled.size(), cl - idx - profiled.size() - 1);
+        }
+    }
+    name = replace_all(name, "Halide::Internal::", "");
+    name = replace_all(name, "(anonymous namespace)::", "");
+    debug(2) << name << "\n";
+    return name;
+}
+
+Context::Context() {
+    if (std::string file = get_env_variable("HL_COMPILER_TRACE_FILE");
!file.empty()) {
+        if (std::string bits = get_env_variable("HL_COMPILER_TRACE_BITS"); !bits.empty()) {
+            active_bits = BIT_GENERIC;
+            std::vector<std::string> sp = split_string(bits, ",");
+            for (const std::string &s : sp) {
+                if (s == "stmt") {
+                    active_bits |= BIT_STMT;
+                } else if (s == "expr") {
+                    active_bits |= BIT_EXPR;
+                }
+            }
+        } else {
+            active_bits = BIT_GENERIC;
+        }
+    }
+}
+Context::~Context() {
+    if (std::string file = get_env_variable("HL_COMPILER_TRACE_FILE"); !file.empty()) {
+        Profiling::write_halide_profiling_trace(file);
+    }
+}
+
+void write_halide_profiling_trace(const std::string &file) {
+    ZoneScoped;
+    std::lock_guard<std::mutex> lock(ctx.mutex);  // Ensure no threads are born while serializing
+
+    debug(1) << "Emitting trace.json events: " << file << "\n";
+
+    // Pass 1: Find the absolute global t=0 across all threads
+    uint64_t global_start_cycles = static_cast<uint64_t>(-1);
+    for (const auto &trace : ctx.traces) {
+        if (trace.events.empty()) {
+            continue;
+        }
+
+        uint64_t first_cycle = trace.start_cycles_64;
+        uint32_t first_timer = trace.events[0].timer;
+        uint32_t anchor_lower = static_cast<uint32_t>(first_cycle & 0xFFFFFFFF);
+
+        // Did the 32-bit timer roll over between thread creation and the first event?
+        if (first_timer < anchor_lower) {
+            first_cycle += 0x100000000ULL;
+        }
+        first_cycle = (first_cycle & 0xFFFFFFFF00000000ULL) | first_timer;
+
+        if (first_cycle < global_start_cycles) {
+            global_start_cycles = first_cycle;
+        }
+    }
+
+    std::map<const char *, std::string> demangled_names;
+
+    std::ofstream out(file);
+    out << "[\n";
+    bool first = true;
+
+    for (const auto &trace : ctx.traces) {
+        uint64_t current_cycles = trace.start_cycles_64;
+        uint32_t last_timer = static_cast<uint32_t>(current_cycles & 0xFFFFFFFF);
+
+        for (const auto &ev : trace.events) {
+            // Reconstruct the 64-bit timeline for this specific thread
+            if (ev.timer < last_timer) {
+                current_cycles += 0x1'0000'0000ULL;
+            }
+            current_cycles = (current_cycles & 0xFFFFFFFF00000000ULL) | ev.timer;
+            last_timer = ev.timer;
+
+            double ts_us = static_cast<double>(current_cycles - global_start_cycles) * 1e-3;
+
+            if (!first) {
+                out << ",\n";
+            }
+            first = false;
+
+            out << " {";
+            if (ev.event_type == 0) {
+                const char *src_tag = ev.src_tag;
+                if (
+#if !defined(_WIN32)
+                    std::strncmp(src_tag, "N6Halide", 8) == 0
+#else
+                    std::strncmp(src_tag, "class Halide", 12) == 0 ||
+                    std::strncmp(src_tag, "struct Halide", 13) == 0
+#endif
+                ) {
+                    if (auto it = demangled_names.find(src_tag); it == demangled_names.end()) {
+                        std::string dn = demangle(src_tag);
+                        dn = cleanup_name(dn);
+                        auto result = demangled_names.emplace(src_tag, std::move(dn));
+                        src_tag = result.first->second.c_str();
+                    } else {
+                        src_tag = it->second.c_str();
+                    }
+                }
+                // Only add strings to the start event.
+                if (ev.tag == Event::Visitor) {
+                    const char *node_name = IRNodeType_string((IRNodeType)ev.data);
+                    out << "\"name\": \"" << node_name << "\", ";
+                    out << "\"cat\": \"" << src_tag << "\", ";
+                } else if (ev.tag == Event::Generic) {
+                    out << "\"name\": \"" << src_tag << "\", ";
+                    out << "\"args\": {";
+                    out << "\"data\": \"" << ev.data << "\"";
+                    out << "}, ";
+                }
+            }
+            out << "\"ph\": \"" << (ev.event_type == 0 ?
"B" : "E") << "\", ";
+            out << "\"pid\": 1, ";
+            out << "\"tid\": " << trace.tid << ", ";  // Same TID forces perfectly nested flame-graph boxes
+            out << "\"ts\": " << std::fixed << std::setprecision(3) << ts_us;
+
+            out << "}\n";  // Let's put a newline to not kill bad parsers.
+        }
+    }
+
+    out << "\n]\n";
+}
+
+}  // namespace Profiling
+}  // namespace Internal
+}  // namespace Halide
+
+#endif
diff --git a/src/CompilerProfiling.h b/src/CompilerProfiling.h
new file mode 100644
index 000000000000..228b98bce3bf
--- /dev/null
+++ b/src/CompilerProfiling.h
@@ -0,0 +1,245 @@
+#ifndef HALIDE_PROFILED_IR_VISITOR_H
+#define HALIDE_PROFILED_IR_VISITOR_H
+
+#ifdef WITH_COMPILER_PROFILING
+
+#include "IR.h"
+
+#include <chrono>
+#include <list>
+#include <mutex>
+#include <thread>
+
+#ifndef __FUNCTION_NAME__
+#ifdef WIN32  // WINDOWS
+#define __FUNCTION_NAME__ __FUNCTION__
+#else  //*NIX
+#define __FUNCTION_NAME__ __func__
+#endif
+#endif
+
+/** \file
+ * Defines support for tracing/profiling the Halide compiler itself.
+ */
+
+namespace Halide {
+namespace Internal {
+namespace Profiling {
+
+constexpr uint8_t BIT_GENERIC = 1 << 0;
+constexpr uint8_t BIT_STMT = 1 << 1;
+constexpr uint8_t BIT_EXPR = 1 << 2;
+
+inline uint64_t performance_counter() {
+    auto now = std::chrono::high_resolution_clock::now().time_since_epoch();
+    return static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(now).count());
+}
+
+struct Event {
+    const char *src_tag;
+    uint32_t timer;
+    uint8_t event_type : 1;  // 0 for Start, 1 for Stop
+
+    enum Tag : uint8_t {
+        Generic,
+        Visitor,
+    } tag : 1;
+
+    /** Optional for visitors and mutators.
 */
+    uint32_t data : 30;
+};
+static_assert(sizeof(Event) == sizeof(void *) + 8);
+
+struct ThreadTrace {
+    uint32_t tid;
+    uint64_t start_cycles_64;  // 64-bit anchor to align threads accurately
+    std::vector<Event> events;
+};
+
+struct Context {
+    std::mutex mutex;
+    uint8_t active_bits{0};
+
+    // std::list guarantees that pointers to elements are never invalidated
+    std::list<ThreadTrace> traces;
+
+    Context();
+    ~Context();
+};
+
+inline Context ctx;
+
+inline ThreadTrace &init_thread_profiler() {
+    std::lock_guard<std::mutex> lock(ctx.mutex);
+
+    // Hash thread ID to get a clean 32-bit integer for the Chrome UI
+    uint32_t tid = static_cast<uint32_t>(std::hash<std::thread::id>{}(std::this_thread::get_id()));
+    uint64_t anchor = performance_counter();
+
+    ctx.traces.push_back({tid, anchor, {}});
+    ThreadTrace &tt = ctx.traces.back();
+
+    // Pre-allocate ~16MB per thread to guarantee zero allocations during hot loops
+    tt.events.reserve(1'000'000);
+
+    return tt;
+}
+
+inline thread_local ThreadTrace &tls_thread_trace = init_thread_profiler();
+inline thread_local std::vector<Event> *tls_profiling_events = &tls_thread_trace.events;
+
+inline void zone_begin(const char *src_tag, Event::Tag tag, unsigned data = 0) {
+    Event pe;
+    pe.timer = static_cast<uint32_t>(performance_counter());
+    pe.tag = tag;
+    pe.src_tag = src_tag;
+    pe.event_type = 0;  // Start
+    pe.data = data;
+    tls_profiling_events->push_back(pe);
+}
+
+inline void zone_end(const char *src_tag, Event::Tag tag, unsigned data = 0) {
+    Event pe;
+    pe.timer = static_cast<uint32_t>(performance_counter());
+    pe.tag = tag;
+    pe.src_tag = src_tag;
+    pe.event_type = 1;  // Stop
+    pe.data = data;
+    tls_profiling_events->push_back(pe);
+}
+
+inline void generic_zone_begin(const char *src_tag, unsigned data = 0) {
+    if (ctx.active_bits & BIT_GENERIC) {
+        zone_begin(src_tag, Event::Tag::Generic, data);
+    }
+}
+
+inline void generic_zone_end(const char *src_tag, unsigned data = 0) {
+    if (ctx.active_bits & BIT_GENERIC) {
+        zone_end(src_tag, Event::Tag::Generic, data);
+    }
+}
+
+struct 
ZoneScopedVisitor_ { + IRNodeType node_type; + const char *src_tag; + bool active; + + ZoneScopedVisitor_(IRNodeType node_type, const char *src_tag, uint8_t activation_bits) + : node_type(node_type), src_tag(src_tag), active(ctx.active_bits & activation_bits) { + if (active) { + zone_begin(src_tag, Event::Visitor, (unsigned)node_type); + } + } + + ZoneScopedVisitor_(const Expr &e, const char *src_tag) + : node_type(e.defined() ? e->node_type : IRNodeType::IntImm), + src_tag(src_tag), + active((ctx.active_bits & BIT_EXPR) && e.defined()) { + if (active) { + zone_begin(src_tag, Event::Visitor, (unsigned)node_type); + } + } + + ZoneScopedVisitor_(const Stmt &s, const char *src_tag) + : node_type(s.defined() ? s->node_type : IRNodeType::IntImm), + src_tag(src_tag), + active((ctx.active_bits & BIT_STMT) && s.defined()) { + if (active) { + zone_begin(src_tag, Event::Visitor, (unsigned)node_type); + } + } + + ~ZoneScopedVisitor_() { + if (active) { + zone_end(src_tag, Event::Visitor, (unsigned)node_type); + } + } +}; + +struct ZoneScoped_ { + Event::Tag tag; + const char *src_tag; + + ZoneScoped_(const char *src_tag, unsigned data = 0, Event::Tag tag = Event::Tag::Generic) + : tag(tag), + src_tag(src_tag) { + zone_begin(src_tag, tag, data); + } + + ~ZoneScoped_() { + zone_end(src_tag, tag, 0); + } +}; + +#define ZoneScoped \ + Halide::Internal::Profiling::ZoneScoped_ __zone_scoped(__FUNCTION_NAME__) +#define ZoneScopedN(...) \ + Halide::Internal::Profiling::ZoneScoped_ __zone_scoped(__VA_ARGS__) +#define ZoneScopedVisitor(...) 
\
+    Halide::Internal::Profiling::ZoneScopedVisitor_ __zone_scoped(__VA_ARGS__)
+
+#ifdef HALIDE_ENABLE_RTTI
+#define HalideVisitorDynamicNameTag typeid(*this).name()
+#else
+#define HalideVisitorDynamicNameTag __FUNCTION__
+#endif
+
+template<typename Base>
+class Profiled : public Base {
+public:
+    using Base::Base;
+#ifdef HALIDE_ENABLE_RTTI
+    const char *tag = typeid(Base).name();
+#else
+    const char *tag = "Unknown (no RTTI)";
+#endif
+
+#define PROFILE_VISIT_STMT_OVERRIDE(T)                                    \
+    auto visit(const T *op) -> decltype(this->Base::visit(op)) override { \
+        ZoneScopedVisitor_ _prof(IRNodeType::T, tag, BIT_STMT);           \
+        return Base::visit(op);                                           \
+    }
+    HALIDE_FOR_EACH_IR_STMT(PROFILE_VISIT_STMT_OVERRIDE)
+
+#undef PROFILE_VISIT_STMT_OVERRIDE
+
+#define PROFILE_VISIT_EXPR_OVERRIDE(T)                                    \
+    auto visit(const T *op) -> decltype(this->Base::visit(op)) override { \
+        ZoneScopedVisitor_ _prof(IRNodeType::T, tag, BIT_EXPR);           \
+        return Base::visit(op);                                           \
+    }
+    HALIDE_FOR_EACH_IR_EXPR(PROFILE_VISIT_EXPR_OVERRIDE)
+
+#undef PROFILE_VISIT_EXPR_OVERRIDE
+};
+
+void write_halide_profiling_trace(const std::string &file);
+
+}  // namespace Profiling
+
+template<typename Base>
+using Profiled = Profiling::Profiled<Base>;
+
+}  // namespace Internal
+}  // namespace Halide
+
+#else
+
+namespace Profiling {
+inline void generic_zone_begin(const char *src_tag, unsigned data = 0) {
+}
+inline void generic_zone_end(const char *src_tag, unsigned data = 0) {
+}
+}  // namespace Profiling
+
+template<typename Base>
+using Profiled = Base;
+#define ZoneScoped
+#define ZoneScopedN(...)
+#define ZoneScopedVisitor(...)
+#define HalideVisitorDynamicNameTag + +#endif + +#endif // HALIDE_PROFILED_IR_VISITOR_H diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index ec85e5a383f9..a8d447bd391c 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -1603,6 +1603,7 @@ class NormalizeIfStatements : public IRMutator { } // namespace Stmt fuse_gpu_thread_loops(Stmt s) { + ZoneScoped; // NormalizeIfStatements pushes the predicates between GPU blocks // into the innermost GPU block. FuseGPUThreadLoops would then // merge the predicate into the merged GPU thread. diff --git a/src/IRMutator.h b/src/IRMutator.h index 5a389fc4870d..ab502f3edade 100644 --- a/src/IRMutator.h +++ b/src/IRMutator.h @@ -9,6 +9,7 @@ #include #include +#include "CompilerProfiling.h" #include "IR.h" namespace Halide { @@ -35,15 +36,18 @@ class IRMutator { * sub-statements. */ inline Expr operator()(const Expr &expr) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate(expr); } inline Stmt operator()(const Stmt &stmt) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate(stmt); } // Like mutate_with_changes, but discard the changes flag. 
std::vector operator()(const std::vector &exprs) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate_with_changes(exprs).first; } @@ -92,9 +96,11 @@ class IRGraphMutator : public IRMutator { public: inline Expr operator()(const Expr &expr) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate(expr); } inline Stmt operator()(const Stmt &stmt) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate(stmt); } }; @@ -134,15 +140,17 @@ struct LambdaMutator final : IRMutator { } protected: -#define HALIDE_CALL_VISIT_EXPR_IMPL(T) \ - Expr visit(const T *op) override { \ - return this->visit_impl(op); \ +#define HALIDE_CALL_VISIT_EXPR_IMPL(T) \ + Expr visit(const T *op) override { \ + ZoneScopedVisitor(IRNodeType::T, "LambdaMutator", Profiling::BIT_EXPR); \ + return this->visit_impl(op); \ } HALIDE_FOR_EACH_IR_EXPR(HALIDE_CALL_VISIT_EXPR_IMPL) #undef HALIDE_CALL_VISIT_EXPR_IMPL -#define HALIDE_CALL_VISIT_STMT_IMPL(T) \ - Stmt visit(const T *op) override { \ - return this->visit_impl(op); \ +#define HALIDE_CALL_VISIT_STMT_IMPL(T) \ + Stmt visit(const T *op) override { \ + ZoneScopedVisitor(IRNodeType::T, "LambdaMutator", Profiling::BIT_STMT); \ + return this->visit_impl(op); \ } HALIDE_FOR_EACH_IR_STMT(HALIDE_CALL_VISIT_STMT_IMPL) #undef HALIDE_CALL_VISIT_STMT_IMPL diff --git a/src/IROperator.cpp b/src/IROperator.cpp index c729539daa29..bd5319944f26 100644 --- a/src/IROperator.cpp +++ b/src/IROperator.cpp @@ -1075,18 +1075,22 @@ T remove_intrinsics(const T &e, const std::initializer_list & } // namespace Expr remove_likelies(const Expr &e) { + ZoneScoped; return remove_intrinsics(e, {Call::likely, Call::likely_if_innermost}); } Stmt remove_likelies(const Stmt &s) { + ZoneScoped; return remove_intrinsics(s, {Call::likely, Call::likely_if_innermost}); } Expr remove_promises(const Expr &e) { + ZoneScoped; return remove_intrinsics(e, {Call::promise_clamped, Call::unsafe_promise_clamped}); } Stmt remove_promises(const Stmt &s) { + ZoneScoped; return 
remove_intrinsics(s, {Call::promise_clamped, Call::unsafe_promise_clamped}); } diff --git a/src/IRVisitor.h b/src/IRVisitor.h index 322f2bff6ba3..be0df5168e73 100644 --- a/src/IRVisitor.h +++ b/src/IRVisitor.h @@ -3,6 +3,7 @@ #include +#include "CompilerProfiling.h" #include "IR.h" /** \file @@ -22,15 +23,18 @@ class IRVisitor { virtual ~IRVisitor() = default; inline void operator()(const Stmt &s) { + ZoneScopedN(HalideVisitorDynamicNameTag); s.accept(this); } inline void operator()(const Expr &e) { + ZoneScopedN(HalideVisitorDynamicNameTag); e.accept(this); } template inline void operator()(const T *op) { + ZoneScopedN(HalideVisitorDynamicNameTag); visit(op); } @@ -120,9 +124,11 @@ void visit_with(const IRHandle &ir, Lambdas &&...lambdas) { class IRGraphVisitor : public IRVisitor { public: inline void operator()(const Expr &e) { + ZoneScopedN(HalideVisitorDynamicNameTag); include(e); } inline void operator()(const Stmt &s) { + ZoneScopedN(HalideVisitorDynamicNameTag); include(s); } @@ -160,11 +166,20 @@ class IRGraphVisitor : public IRVisitor { template class VariadicVisitor { private: +#ifdef WITH_COMPILER_PROFILING +#ifdef HALIDE_ENABLE_RTTI + const char *name = typeid(T).name(); +#else + const char *name = "VariadicVisitor"; +#endif +#endif + template ExprRet dispatch_expr(const BaseExprNode *node, Args &&...args) { if (node == nullptr) { return ExprRet{}; } + ZoneScopedVisitor(node->node_type, name, Profiling::BIT_EXPR); switch (node->node_type) { #define HALIDE_SWITCH_EXPR(NT) \ case IRNodeType::NT: \ @@ -183,6 +198,7 @@ class VariadicVisitor { if (node == nullptr) { return StmtRet{}; } + ZoneScopedVisitor(node->node_type, name, Profiling::BIT_STMT); switch (node->node_type) { #define HALIDE_SWITCH_STMT(NT) \ case IRNodeType::NT: \ diff --git a/src/InjectHostDevBufferCopies.cpp b/src/InjectHostDevBufferCopies.cpp index 6e8b21686657..c3b57b0c8435 100644 --- a/src/InjectHostDevBufferCopies.cpp +++ b/src/InjectHostDevBufferCopies.cpp @@ -1,6 +1,7 @@ #include 
"InjectHostDevBufferCopies.h" #include "CodeGen_GPU_Dev.h" +#include "CompilerProfiling.h" #include "Debug.h" #include "ExternFuncArgument.h" #include "IRMutator.h" @@ -217,6 +218,7 @@ class InjectBufferCopiesForSingleBuffer : public IRMutator { } Stmt do_copies(Stmt s, FindBufferUsage *precomputed = nullptr) { + ZoneScoped; // Sniff what happens to the buffer inside the stmt FindBufferUsage local_finder(buffer, DeviceAPI::Host); if (!precomputed) { @@ -570,6 +572,7 @@ class InjectBufferCopies : public IRMutator { }; Stmt inject_free_after_last_use(Stmt body, const Stmt &last_use, const Stmt &free_stmt) { + ZoneScoped; bool success = false; body = mutate_with( body, @@ -762,6 +765,7 @@ class InjectBufferCopiesForInputsAndOutputs : public IRMutator { finder(s); Stmt new_stmt = s; for (const string &buf : finder.result) { + ZoneScopedN("InjectBufferCopiesForSingleBuffer"); new_stmt = InjectBufferCopiesForSingleBuffer(buf, true, finder.result_storage.at(buf))(new_stmt); } return new_stmt; diff --git a/src/JITModule.cpp b/src/JITModule.cpp index 4bdba15bdf25..02d8f293feeb 100644 --- a/src/JITModule.cpp +++ b/src/JITModule.cpp @@ -285,6 +285,7 @@ namespace { // Retrieve a function pointer from an llvm module, possibly by compiling it. 
JITModule::Symbol compile_and_get_function(llvm::orc::LLJIT &JIT, const string &name) { + ZoneScoped; debug(2) << "JIT Compiling " << name << "\n"; auto addr = JIT.lookup(name); @@ -341,6 +342,7 @@ JITModule::JITModule() { JITModule::JITModule(const Module &m, const LoweredFunc &fn, const std::vector &dependencies) { + ZoneScoped; jit_module = new JITModuleContents(); std::unique_ptr llvm_module(compile_module_to_llvm_module(m, *jit_module->context)); std::vector deps_with_runtime = dependencies; @@ -358,6 +360,7 @@ void compile_module_impl( std::unique_ptr m, const string &function_name, const Target &target, const std::vector &dependencies, const std::vector &requested_exports) { + ZoneScoped; // Ensure that LLVM is initialized CodeGen_LLVM::initialize_llvm(); @@ -532,6 +535,7 @@ void compile_module_impl( void JITModule::compile_module(std::unique_ptr m, const string &function_name, const Target &target, const std::vector &dependencies, const std::vector &requested_exports) { + ZoneScoped; // LLJIT's SimpleCompiler triggers LLVM's AsmPrinter, which can use a large // amount of stack (observed stack overflows on macOS worker threads with // 512KB default stacks). Use run_with_large_stack to ensure enough space. 
diff --git a/src/LICM.cpp b/src/LICM.cpp index d0ee80aad177..a34c194cecb5 100644 --- a/src/LICM.cpp +++ b/src/LICM.cpp @@ -1,5 +1,6 @@ #include "LICM.h" #include "CSE.h" +#include "CompilerProfiling.h" #include "ExprUsesVar.h" #include "IREquality.h" #include "IRMutator.h" @@ -525,6 +526,7 @@ class GroupLoopInvariants : public IRMutator { } // namespace Stmt hoist_loop_invariant_values(Stmt s) { + ZoneScoped; s = GroupLoopInvariants()(s); s = common_subexpression_elimination(s); s = LICM()(s); diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index 6eaf7496f178..6e65018035ef 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -1,4 +1,5 @@ #include "LLVM_Runtime_Linker.h" +#include "CompilerProfiling.h" #include "Error.h" #include "LLVM_Headers.h" #include "Target.h" @@ -11,6 +12,7 @@ using std::vector; namespace { std::unique_ptr parse_bitcode_file(llvm::StringRef buf, llvm::LLVMContext *context, const char *id) { + ZoneScoped; llvm::MemoryBufferRef bitcode_buffer = llvm::MemoryBufferRef(buf, id); @@ -618,6 +620,7 @@ void convert_weak_to_linkonce(llvm::GlobalValue &gv) { // triple appropriately for the target. void link_modules(std::vector> &modules, Target t, bool allow_stripping_all_weak_functions = false) { + ZoneScoped; llvm::DataLayout data_layout = get_data_layout_for_target(t); llvm::Triple triple = Internal::get_triple_for_target(t); @@ -846,6 +849,7 @@ namespace Internal { std::unique_ptr link_with_wasm_jit_runtime(llvm::LLVMContext *c, const Target &t, std::unique_ptr extra_module) { + ZoneScoped; bool bits_64 = (t.bits == 64); bool debug = t.has_feature(Target::Debug); @@ -889,6 +893,7 @@ std::unique_ptr link_with_wasm_jit_runtime(llvm::LLVMContext *c, c /** Create an llvm module containing the support code for a given target. 
*/ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVMContext *c, bool for_shared_jit_runtime, bool just_gpu) { + ZoneScoped; enum InitialModuleType { ModuleAOT, ModuleAOTNoRuntime, @@ -1381,6 +1386,7 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM #ifdef WITH_NVPTX std::unique_ptr get_initial_module_for_ptx_device(Target target, llvm::LLVMContext *c) { + ZoneScoped; std::vector> modules; modules.push_back(get_initmod_ptx_dev_ll(c)); @@ -1434,6 +1440,7 @@ std::unique_ptr get_initial_module_for_ptx_device(Target target, l void add_bitcode_to_module(llvm::LLVMContext *context, llvm::Module &module, const std::vector &bitcode, const std::string &name) { + ZoneScoped; llvm::StringRef sb = llvm::StringRef((const char *)&bitcode[0], bitcode.size()); std::unique_ptr add_in = parse_bitcode_file(sb, context, name.c_str()); diff --git a/src/Lower.cpp b/src/Lower.cpp index c248684ea5d9..ac34766fc9fb 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -21,6 +21,7 @@ #include "CanonicalizeGPUVars.h" #include "ClampUnsafeAccesses.h" #include "CompilerLogger.h" +#include "CompilerProfiling.h" #include "Debug.h" #include "DebugArguments.h" #include "DebugToFile.h" @@ -94,29 +95,52 @@ namespace { class LoweringLogger { Stmt last_written; std::chrono::time_point last_time; + const char *last_msg; + std::vector> timings; bool time_lowering_passes = false; public: LoweringLogger() { - last_time = std::chrono::high_resolution_clock::now(); static bool should_time = !get_env_variable("HL_TIME_LOWERING_PASSES").empty(); time_lowering_passes = should_time; } - void operator()(const string &message, const Stmt &s) { + void begin(const char *msg) { + debug(1) << "Lowering pass: " << msg << "...\n"; + Profiling::generic_zone_begin(msg); + last_time = std::chrono::high_resolution_clock::now(); + last_msg = msg; + } + + void begin(const char *msg, int data) { + debug(1) << "Lowering pass: " << msg << " " << data << "...\n"; + 
Profiling::generic_zone_begin(msg, data); + last_time = std::chrono::high_resolution_clock::now(); + last_msg = msg; + } + + void end() { + Profiling::generic_zone_end(last_msg); auto t = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = t - last_time; + timings.emplace_back(diff.count() * 1000, last_msg); + } + + void end(const Stmt &s) { + Profiling::generic_zone_end(last_msg); + auto t = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = t - last_time; + timings.emplace_back(diff.count() * 1000, last_msg); if (!s.same_as(last_written)) { - debug(2) << message << "\n" + debug(2) << "Lowering after " << last_msg << "\n" << s << "\n"; last_written = s; last_time = t; } else { - debug(2) << message << " (unchanged)\n\n"; + debug(2) << "Lowering after " << last_msg << " (unchanged)\n\n"; last_time = t; } - timings.emplace_back(diff.count() * 1000, message); } ~LoweringLogger() { @@ -143,6 +167,7 @@ void lower_impl(const vector &output_funcs, bool trace_pipeline, const vector &custom_passes, Module &result_module) { + ZoneScoped; auto time_start = std::chrono::high_resolution_clock::now(); size_t initial_lowered_function_count = result_module.functions().size(); @@ -173,293 +198,294 @@ void lower_impl(const vector &output_funcs, LoweringLogger log; - debug(1) << "Creating initial loop nests...\n"; + log.begin("Creating initial loop nests"); bool any_memoized = false; Stmt s = schedule_functions(outputs, fused_groups, env, t, any_memoized); - log("Lowering after creating initial loop nests:", s); + log.end(s); if (any_memoized) { - debug(1) << "Injecting memoization...\n"; + log.begin("Injecting memoization"); s = inject_memoization(s, env, pipeline_name, outputs); - log("Lowering after injecting memoization:", s); + log.end(s); } else { debug(1) << "Skipping injecting memoization...\n"; } - debug(1) << "Injecting tracing...\n"; + log.begin("Injecting tracing"); s = inject_tracing(s, pipeline_name, trace_pipeline, env, 
outputs, t); - log("Lowering after injecting tracing:", s); + log.end(s); - debug(1) << "Adding checks for parameters\n"; + log.begin("Adding checks for parameters"); s = add_parameter_checks(requirements, s, t); - log("Lowering after injecting parameter checks:", s); + log.end(s); // Compute the maximum and minimum possible value of each // function. Used in later bounds inference passes. - debug(1) << "Computing bounds of each function's value\n"; + log.begin("Computing bounds of each function's value"); FuncValueBounds func_bounds = compute_function_value_bounds(order, env); + log.end(); // Clamp unsafe instances where a Func f accesses a Func g using // an index which depends on a third Func h. - debug(1) << "Clamping unsafe data-dependent accesses\n"; + log.begin("Clamping unsafe data-dependent accesses"); s = clamp_unsafe_accesses(s, env, func_bounds); - log("Lowering after clamping unsafe data-dependent accesses", s); + log.end(s); // This pass injects nested definitions of variable names, so we // can't simplify statements from here until we fix them up. (We // can still simplify Exprs). 
- debug(1) << "Performing computation bounds inference...\n"; + log.begin("Bounds inference"); s = bounds_inference(s, outputs, order, fused_groups, env, func_bounds, t); - log("Lowering after computation bounds inference:", s); + log.end(s); - debug(1) << "Asserting that all split factors are positive...\n"; + log.begin("Asserting that all split factors are positive"); s = add_split_factor_checks(s, env); - log("Lowering after asserting that all split factors are positive:", s); + log.end(s); - debug(1) << "Removing extern loops...\n"; + log.begin("Removing extern loops"); s = remove_extern_loops(s); - log("Lowering after removing extern loops:", s); + log.end(s); - debug(1) << "Performing sliding window optimization...\n"; + log.begin("Sliding window optimization"); s = sliding_window(s, env); - log("Lowering after sliding window:", s); + log.end(s); // This uniquifies the variable names, so we're good to simplify // after this point. This lets later passes assume syntactic // equivalence means semantic equivalence. 
- debug(1) << "Uniquifying variable names...\n"; + log.begin("Uniquifying variable names"); s = uniquify_variable_names(s); - log("Lowering after uniquifying variable names:", s); + log.end(s); - debug(1) << "Simplifying...\n"; + log.begin("Simplifying"); s = simplify(s); - log("Lowering after first simplification:", s); + log.end(s); - debug(1) << "Simplifying correlated differences...\n"; + log.begin("Simplifying correlated differences"); s = simplify_correlated_differences(s); - log("Lowering after simplifying correlated differences:", s); + log.end(s); - debug(1) << "Performing allocation bounds inference...\n"; + log.begin("Allocation bounds inference"); s = allocation_bounds_inference(s, env, func_bounds); - log("Lowering after allocation bounds inference:", s); + log.end(s); bool will_inject_host_copies = (t.has_gpu_feature() || t.has_feature(Target::HexagonDma) || (t.arch != Target::Hexagon && (t.has_feature(Target::HVX)))); - debug(1) << "Adding checks for images\n"; + log.begin("Adding checks for images"); s = add_image_checks(s, outputs, t, order, env, func_bounds, will_inject_host_copies); - log("Lowering after injecting image checks:", s); + log.end(s); - debug(1) << "Removing code that depends on undef values...\n"; + log.begin("Removing code that depends on undef values"); s = remove_undef(s); - log("Lowering after removing code that depends on undef values:", s); + log.end(s); - debug(1) << "Performing storage folding optimization...\n"; + log.begin("Performing storage folding optimization"); s = storage_folding(s, env); - log("Lowering after storage folding:", s); + log.end(s); - debug(1) << "Injecting debug_to_file calls...\n"; + log.begin("Injecting debug_to_file calls"); s = debug_to_file(s, outputs, env); - log("Lowering after injecting debug_to_file calls:", s); + log.end(s); - debug(1) << "Injecting prefetches...\n"; + log.begin("Injecting prefetches"); s = inject_prefetch(s, env); - log("Lowering after injecting prefetches:", s); + 
log.end(s); - debug(1) << "Discarding safe promises...\n"; + log.begin("Discarding safe promises"); s = lower_safe_promises(s); - log("Lowering after discarding safe promises:", s); + log.end(s); - debug(1) << "Dynamically skipping stages...\n"; + log.begin("Dynamically skipping stages"); s = skip_stages(s, outputs, fused_groups, env); - log("Lowering after dynamically skipping stages:", s); + log.end(s); - debug(1) << "Forking asynchronous producers...\n"; + log.begin("Forking asynchronous producers"); s = fork_async_producers(s, env); - log("Lowering after forking asynchronous producers:", s); + log.end(s); - debug(1) << "Destructuring tuple-valued realizations...\n"; + log.begin("Destructuring tuple-valued realizations"); s = split_tuples(s, env); - log("Lowering after destructuring tuple-valued realizations:", s); + log.end(s); if (t.has_gpu_feature()) { - debug(1) << "Canonicalizing GPU var names...\n"; + log.begin("Canonicalizing GPU var names"); s = canonicalize_gpu_vars(s); - log("Lowering after canonicalizing GPU var names:", s); + log.end(s); } - debug(1) << "Bounding small realizations...\n"; + log.begin("Bounding small realizations"); s = simplify_correlated_differences(s); s = bound_small_allocations(s); - log("Lowering after bounding small realizations:", s); + log.end(s); - debug(1) << "Performing storage flattening...\n"; + log.begin("Performing storage flattening"); s = storage_flattening(s, outputs, env, t); - log("Lowering after storage flattening:", s); + log.end(s); - debug(1) << "Adding atomic mutex allocation...\n"; + log.begin("Adding atomic mutex allocation"); s = add_atomic_mutex(s, outputs); - log("Lowering after adding atomic mutex allocation:", s); + log.end(s); - debug(1) << "Unpacking buffer arguments...\n"; + log.begin("Unpacking buffer arguments"); s = unpack_buffers(s); - log("Lowering after unpacking buffer arguments:", s); + log.end(s); if (any_memoized) { - debug(1) << "Rewriting memoized allocations...\n"; + 
log.begin("Rewriting memoized allocations"); s = rewrite_memoized_allocations(s, env); - log("Lowering after rewriting memoized allocations:", s); + log.end(s); } else { - debug(1) << "Skipping rewriting memoized allocations...\n"; + debug(1) << "Skipping rewriting memoized allocations\n"; } if (will_inject_host_copies) { - debug(1) << "Selecting a GPU API for GPU loops...\n"; + log.begin("Selecting a GPU API for GPU loops"); s = select_gpu_api(s, t); - log("Lowering after selecting a GPU API:", s); + log.end(s); - debug(1) << "Injecting host <-> dev buffer copies...\n"; + log.begin("Injecting host <-> dev buffer copies"); s = inject_host_dev_buffer_copies(s, t); - log("Lowering after injecting host <-> dev buffer copies:", s); + log.end(s); - debug(1) << "Selecting a GPU API for extern stages...\n"; + log.begin("Selecting a GPU API for extern stages"); s = select_gpu_api(s, t); - log("Lowering after selecting a GPU API for extern stages:", s); + log.end(s); } else { - debug(1) << "Injecting host-dirty marking...\n"; + log.begin("Injecting host-dirty marking"); s = inject_host_dev_buffer_copies(s, t); - log("Lowering after injecting host-dirty marking:", s); + log.end(s); } - debug(1) << "Simplifying...\n"; + log.begin("Simplifying"); s = simplify(s); s = unify_duplicate_lets(s); - log("Lowering after second simplification:", s); + log.end(s); - debug(1) << "Reduce prefetch dimension...\n"; + log.begin("Reduce prefetch dimension"); s = reduce_prefetch_dimension(s, t); - log("Lowering after reduce prefetch dimension:", s); + log.end(s); - debug(1) << "Simplifying correlated differences...\n"; + log.begin("Simplifying correlated differences"); s = simplify_correlated_differences(s); - log("Lowering after simplifying correlated differences:", s); + log.end(s); - debug(1) << "Bounding constant extent loops...\n"; + log.begin("Bounding constant extent loops"); s = bound_constant_extent_loops(s); - log("Lowering after bounding constant extent loops:", s); + log.end(s); - 
debug(1) << "Unrolling...\n"; + log.begin("Unrolling"); s = unroll_loops(s); - log("Lowering after unrolling:", s); + log.end(s); - debug(1) << "Vectorizing...\n"; + log.begin("Vectorizing"); s = vectorize_loops(s, env); s = simplify(s); - log("Lowering after vectorizing:", s); + log.end(s); if (t.has_gpu_feature() || t.has_feature(Target::Vulkan)) { - debug(1) << "Injecting per-block gpu synchronization...\n"; + log.begin("Injecting per-block gpu synchronization"); s = fuse_gpu_thread_loops(s); - log("Lowering after injecting per-block gpu synchronization:", s); + log.end(s); } - debug(1) << "Detecting vector interleavings...\n"; + log.begin("Detecting vector interleavings"); s = rewrite_interleavings(s); s = simplify(s); - log("Lowering after rewriting vector interleavings:", s); + log.end(s); - debug(1) << "Partitioning loops to simplify boundary conditions...\n"; + log.begin("Partitioning loops to simplify boundary conditions"); s = partition_loops(s); s = simplify(s); - log("Lowering after partitioning loops:", s); + log.end(s); - debug(1) << "Staging strided loads...\n"; + log.begin("Staging strided loads"); s = stage_strided_loads(s, t); - log("Lowering after staging strided loads:", s); + log.end(s); - debug(1) << "Trimming loops to the region over which they do something...\n"; + log.begin("Trimming loops to the region over which they do something"); s = trim_no_ops(s); - log("Lowering after loop trimming:", s); + log.end(s); - debug(1) << "Rebasing loops to zero...\n"; + log.begin("Rebasing loops to zero"); s = rebase_loops_to_zero(s); - debug(2) << "Lowering after rebasing loops to zero:\n" - << s << "\n\n"; + log.end(s); - debug(1) << "Hoisting loop invariant if statements...\n"; + log.begin("Hoisting loop invariant if statements"); s = hoist_loop_invariant_if_statements(s); - log("Lowering after hoisting loop invariant if statements:", s); + log.end(s); - debug(1) << "Injecting early frees...\n"; + log.begin("Injecting early frees"); s = 
inject_early_frees(s); - log("Lowering after injecting early frees:", s); + log.end(s); if (t.has_feature(Target::FuzzFloatStores)) { - debug(1) << "Fuzzing floating point stores...\n"; + log.begin("Fuzzing floating point stores"); s = fuzz_float_stores(s); - log("Lowering after fuzzing floating point stores:", s); + log.end(s); } - debug(1) << "Simplifying correlated differences...\n"; + log.begin("Simplifying correlated differences"); s = simplify_correlated_differences(s); - log("Lowering after simplifying correlated differences:", s); + log.end(s); - debug(1) << "Bounding small allocations...\n"; + log.begin("Bounding small allocations"); s = bound_small_allocations(s); - log("Lowering after bounding small allocations:", s); + log.end(s); if (t.has_feature(Target::Profile) || t.has_feature(Target::ProfileByTimer)) { - debug(1) << "Injecting profiling...\n"; + log.begin("Injecting profiling"); s = inject_profiling(s, pipeline_name, env); - log("Lowering after injecting profiling:", s); + log.end(s); } if (t.has_feature(Target::CUDA)) { - debug(1) << "Injecting warp shuffles...\n"; + log.begin("Injecting warp shuffles"); s = lower_warp_shuffles(s, t); - log("Lowering after injecting warp shuffles:", s); + log.end(s); } - debug(1) << "Simplifying...\n"; + log.begin("Simplifying"); s = common_subexpression_elimination(s); + log.end(); - debug(1) << "Lowering unsafe promises...\n"; + log.begin("Lowering unsafe promises"); s = lower_unsafe_promises(s, t); - log("Lowering after lowering unsafe promises:", s); + log.end(s); if (t.has_feature(Target::AVX512_SapphireRapids)) { - debug(1) << "Extracting tile operations...\n"; + log.begin("Extracting tile operations"); s = extract_tile_operations(s); - log("Lowering after extracting tile operations:", s); + log.end(s); } - debug(1) << "Flattening nested ramps...\n"; + log.begin("Flattening nested ramps"); s = flatten_nested_ramps(s); - log("Lowering after flattening nested ramps:", s); + log.end(s); - debug(1) << "Removing 
dead allocations and moving loop invariant code...\n"; + log.begin("Removing dead allocations and moving loop invariant code"); s = remove_dead_allocations(s); s = simplify(s); s = hoist_loop_invariant_values(s); s = hoist_loop_invariant_if_statements(s); - log("Lowering after removing dead allocations and hoisting loop invariants:", s); + log.end(s); - debug(1) << "Finding intrinsics...\n"; + log.begin("Finding intrinsics"); // Must be run after the last simplification, because it turns // divisions into shifts, which the simplifier reverses. s = find_intrinsics(s); - log("Lowering after finding intrinsics:", s); + log.end(s); - debug(1) << "Hoisting prefetches...\n"; + log.begin("Hoisting prefetches"); s = hoist_prefetches(s); - log("Lowering after hoisting prefetches:", s); + log.end(s); if (t.has_feature(Target::NoAsserts)) { - debug(1) << "Stripping asserts...\n"; + log.begin("Stripping asserts"); s = strip_asserts(s); - log("Lowering after stripping asserts:", s); + log.end(s); } debug(1) << "Lowering after final simplification:\n" @@ -467,10 +493,9 @@ void lower_impl(const vector &output_funcs, if (!custom_passes.empty()) { for (size_t i = 0; i < custom_passes.size(); i++) { - debug(1) << "Running custom lowering pass " << i << "...\n"; - s = (*custom_passes[i])(s); - debug(1) << "Lowering after custom pass " << i << ":\n" - << s << "\n\n"; + log.begin("Custom lowering pass", i); + s = custom_passes[i]->operator()(s); + log.end(s); } } @@ -478,19 +503,17 @@ void lower_impl(const vector &output_funcs, result_module.set_conceptual_code_stmt(s); if (t.arch != Target::Hexagon && t.has_feature(Target::HVX)) { - debug(1) << "Splitting off Hexagon offload...\n"; + log.begin("Splitting off Hexagon offload"); s = inject_hexagon_rpc(s, t, result_module); - debug(2) << "Lowering after splitting off Hexagon offload:\n" - << s << "\n"; + log.end(s); } else { debug(1) << "Skipping Hexagon offload...\n"; } if (t.has_gpu_feature()) { - debug(1) << "Offloading GPU 
loops...\n"; + log.begin("Offloading GPU loops"); s = inject_gpu_offload(s, t, any_strict_float); - debug(2) << "Lowering after splitting off GPU loops:\n" - << s << "\n\n"; + log.end(s); } else { debug(1) << "Skipping GPU offload...\n"; } @@ -505,7 +528,7 @@ void lower_impl(const vector &output_funcs, vector inferred_args = infer_arguments(s, outputs); std::vector closure_implementations; - debug(1) << "Lowering Parallel Tasks...\n"; + log.begin("Lowering Parallel Tasks"); s = lower_parallel_tasks(s, closure_implementations, pipeline_name, t); // Process any LoweredFunctions added by other passes. In practice, this // will likely not work well enough due to ordering issues with @@ -520,8 +543,7 @@ void lower_impl(const vector &output_funcs, for (auto &lowered_func : closure_implementations) { result_module.append(lowered_func); } - debug(2) << "Lowering after generating parallel tasks and closures:\n" - << s << "\n\n"; + log.end(s); vector public_args = args; for (const auto &out : outputs) { diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 7351a038e019..85364cc345b5 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -9,6 +9,7 @@ #include "CodeGen_PTX_Dev.h" #include "CodeGen_Vulkan_Dev.h" #include "CodeGen_WebGPU_Dev.h" +#include "CompilerProfiling.h" #include "ExprUsesVar.h" #include "IRMutator.h" #include "IROperator.h" @@ -248,6 +249,7 @@ class InjectGpuOffload : public IRMutator { public: InjectGpuOffload(const Target &target, bool any_strict_float) : target(target) { + ZoneScoped; Target device_target = target; // For the GPU target we just want to pass the flags, to avoid the // generated kernel code unintentionally having any dependence on the @@ -281,12 +283,13 @@ class InjectGpuOffload : public IRMutator { } Stmt inject(const Stmt &s) { + ZoneScoped; // Create a new module for all of the kernels we find in this function. 
for (auto &i : cgdev) { i.second->init_module(); } - Stmt result = mutate(s); + Stmt result = (*this)(s); for (auto &i : cgdev) { string api_unique_name = i.second->api_unique_name(); diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index f4dca6ee3292..7f3fd7ceef9b 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -4,6 +4,7 @@ #include "Argument.h" #include "Callable.h" #include "CodeGen_Internal.h" +#include "CompilerProfiling.h" #include "Deserialization.h" #include "FindCalls.h" #include "Func.h" @@ -495,6 +496,7 @@ Module Pipeline::compile_to_module(const vector &args, const string &fn_name, const Target &target, const LinkageType linkage_type) { + ZoneScoped; user_assert(defined()) << "Can't compile undefined Pipeline.\n"; for (const Function &f : contents->outputs) { @@ -580,6 +582,7 @@ Target Pipeline::get_compiled_jit_target() const { } void Pipeline::compile_jit(const Target &target_arg) { + ZoneScoped; user_assert(defined()) << "Pipeline is undefined\n"; Target target = target_arg; @@ -645,6 +648,7 @@ void Pipeline::compile_jit(const Target &target_arg) { } Callable Pipeline::compile_to_callable(const std::vector &args_in, const Target &target_arg) { + ZoneScoped; user_assert(defined()) << "Pipeline is undefined\n"; Target target = target_arg.with_feature(Target::JIT).with_feature(Target::UserContext); @@ -676,6 +680,7 @@ Callable Pipeline::compile_to_callable(const std::vector &args_in, con const std::vector &outputs, const std::map &jit_externs_in, const Target &target_arg) { + ZoneScoped; user_assert(!target_arg.has_unknowns()) << "Cannot jit-compile for target '" << target_arg << "'\n"; Target jit_target = target_arg.with_feature(Target::JIT).with_feature(Target::UserContext); diff --git a/src/Prefetch.cpp b/src/Prefetch.cpp index 6ab41b3deade..d0d27d9e47be 100644 --- a/src/Prefetch.cpp +++ b/src/Prefetch.cpp @@ -445,12 +445,14 @@ Stmt inject_placeholder_prefetch(const Stmt &s, const map &env } Stmt inject_prefetch(const Stmt &s, const map &env) 
{ + ZoneScoped; CollectExternalBufferBounds finder; s.accept(&finder); return InjectPrefetch(env, finder.buffers)(s); } Stmt reduce_prefetch_dimension(Stmt stmt, const Target &t) { + ZoneScoped; size_t max_dim = 0; Expr max_byte_size; diff --git a/src/Simplify.cpp b/src/Simplify.cpp index a7d7d7f6faa0..d1ac6d828efb 100644 --- a/src/Simplify.cpp +++ b/src/Simplify.cpp @@ -3,6 +3,7 @@ #include "CSE.h" #include "CompilerLogger.h" +#include "CompilerProfiling.h" #include "IRMutator.h" #include "Substitute.h" @@ -16,6 +17,7 @@ using std::string; using std::vector; Simplify::Simplify(const Scope *bi, const Scope *ai) { + ZoneScoped; // Only respect the constant bounds from the containing scope. for (auto iter = bi->cbegin(); iter != bi->cend(); ++iter) { ExprInfo info; @@ -323,6 +325,7 @@ template T substitute_facts_impl(const T &t, const std::set &truths, const std::set &falsehoods) { + ZoneScoped; return mutate_with(t, [&](auto *self, const Expr &e) { if (e.type().is_bool()) { if (truths.count(e)) { @@ -363,6 +366,7 @@ Expr simplify(const Expr &e, const Scope &bounds, const Scope &alignment, const std::vector &assumptions) { + ZoneScoped; Simplify m(&bounds, &alignment); std::vector facts; facts.reserve(assumptions.size()); @@ -380,6 +384,7 @@ Stmt simplify(const Stmt &s, const Scope &bounds, const Scope &alignment, const std::vector &assumptions) { + ZoneScoped; Simplify m(&bounds, &alignment); std::vector facts; facts.reserve(assumptions.size()); @@ -402,10 +407,12 @@ class SimplifyExprs : public IRMutator { }; Stmt simplify_exprs(const Stmt &s) { + ZoneScoped; return SimplifyExprs().mutate(s); } bool can_prove(Expr e, const Scope &bounds) { + ZoneScoped; internal_assert(e.type().is_bool()) << "Argument to can_prove is not a boolean Expr: " << e << "\n"; diff --git a/src/Solve.cpp b/src/Solve.cpp index 204ec7eab366..67ea54a1c1b0 100644 --- a/src/Solve.cpp +++ b/src/Solve.cpp @@ -1210,6 +1210,7 @@ class SolveForInterval : public IRVisitor { } // Anonymous namespace 
SolverResult solve_expression(const Expr &e, const std::string &variable, const Scope &scope) { + ZoneScoped; SolveExpression solver(variable, scope); Expr new_e = solver(e); // The process has expanded lets. Re-collect them. diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index da14d1b52530..0e5dd17c1fa4 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -626,6 +626,7 @@ Stmt storage_flattening(Stmt s, const vector &outputs, const map &env, const Target &target) { + ZoneScoped; s = zero_gpu_loop_mins(s); // Make an environment that makes it easier to figure out which diff --git a/src/Substitute.cpp b/src/Substitute.cpp index 3781914944cf..df71101d0b07 100644 --- a/src/Substitute.cpp +++ b/src/Substitute.cpp @@ -140,10 +140,12 @@ auto substitute_impl(const Expr &find, const Expr &replacement, const T &ir) { } // namespace Expr substitute(const Expr &find, const Expr &replacement, const Expr &expr) { + ZoneScoped; return substitute_impl(find, replacement, expr); } Stmt substitute(const Expr &find, const Expr &replacement, const Stmt &stmt) { + ZoneScoped; return substitute_impl(find, replacement, stmt); } @@ -204,18 +206,22 @@ class GraphSubstituteExpr : public IRGraphMutator { } // namespace Expr graph_substitute(const string &name, const Expr &replacement, const Expr &expr) { + ZoneScoped; return GraphSubstitute(name, replacement)(expr); } Stmt graph_substitute(const string &name, const Expr &replacement, const Stmt &stmt) { + ZoneScoped; return GraphSubstitute(name, replacement)(stmt); } Expr graph_substitute(const Expr &find, const Expr &replacement, const Expr &expr) { + ZoneScoped; return GraphSubstituteExpr(find, replacement)(expr); } Stmt graph_substitute(const Expr &find, const Expr &replacement, const Stmt &stmt) { + ZoneScoped; return GraphSubstituteExpr(find, replacement)(stmt); } @@ -235,10 +241,12 @@ class SubstituteInAllLets : public IRGraphMutator { } // namespace Expr substitute_in_all_lets(const Expr &expr) 
{ + ZoneScoped; return SubstituteInAllLets()(expr); } Stmt substitute_in_all_lets(const Stmt &stmt) { + ZoneScoped; return SubstituteInAllLets()(stmt); } diff --git a/src/UniquifyVariableNames.cpp b/src/UniquifyVariableNames.cpp index 6eec453afc63..2c6cc2e60178 100644 --- a/src/UniquifyVariableNames.cpp +++ b/src/UniquifyVariableNames.cpp @@ -165,6 +165,7 @@ class FindFreeVars : public IRVisitor { } // namespace Stmt uniquify_variable_names(const Stmt &s) { + ZoneScoped; FindFreeVars finder; s.accept(&finder); return UniquifyVariableNames(&finder.free_vars)(s); diff --git a/src/UnrollLoops.cpp b/src/UnrollLoops.cpp index 127dac785dea..48ec31bb655b 100644 --- a/src/UnrollLoops.cpp +++ b/src/UnrollLoops.cpp @@ -1,4 +1,5 @@ #include "UnrollLoops.h" +#include "CompilerProfiling.h" #include "IRMutator.h" #include "IROperator.h" #include "Simplify.h" @@ -16,6 +17,7 @@ class UnrollLoops : public IRMutator { Stmt visit(const For *for_loop) override { if (for_loop->for_type == ForType::Unrolled) { + ZoneScopedN("UnrollForLoop"); Stmt body = for_loop->body; Expr extent = simplify(for_loop->extent()); const IntImm *e = extent.as(); diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index f97099287003..059ca0f8c8d6 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -1663,6 +1663,7 @@ class AllStoresInScope : public IRVisitor { } }; bool all_stores_in_scope(const Stmt &stmt, const Scope<> &scope) { + ZoneScoped; AllStoresInScope checker(scope); stmt.accept(&checker); return checker.result; @@ -1711,6 +1712,7 @@ Stmt vectorize_statement(const Stmt &stmt) { } // namespace Stmt vectorize_loops(const Stmt &stmt, const map &env) { + ZoneScoped; // Limit the scope of atomic nodes to just the necessary stuff. // TODO: Should this be an earlier pass? It's probably a good idea // for non-vectorizing stuff too.