diff --git a/CMakeLists.txt b/CMakeLists.txt index 684e8f988cec..ad4e8ceaf6a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ Halide_feature(WITH_DOCS "Halide's Doxygen documentation" OFF) Halide_feature(WITH_PACKAGING "Halide's CMake package install rules" TOP_LEVEL) Halide_feature(WITH_PYTHON_BINDINGS "Halide's native Python module (not the whole pip package)" ON DEPENDS Halide_ENABLE_EXCEPTIONS AND Halide_ENABLE_RTTI) +Halide_feature(WITH_COMPILER_PROFILING "Enable internal compiler tracing" OFF) Halide_feature(WITH_SERIALIZATION "Include experimental Serialization/Deserialization code" ON) Halide_feature(WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING "Intercepting JIT compilation with a serialization roundtrip, for test only" diff --git a/Makefile b/Makefile index 522330034149..22b99f6e1f50 100644 --- a/Makefile +++ b/Makefile @@ -484,6 +484,7 @@ SOURCE_FILES = \ CodeGen_WebGPU_Dev.cpp \ CodeGen_X86.cpp \ CompilerLogger.cpp \ + CompilerProfiling.cpp \ ConstantBounds.cpp \ ConstantInterval.cpp \ CPlusPlusMangle.cpp \ @@ -681,6 +682,7 @@ HEADER_FILES = \ CodeGen_Vulkan_Dev.h \ CodeGen_WebGPU_Dev.h \ CompilerLogger.h \ + CompilerProfiling.h \ ConciseCasts.h \ ConstantBounds.h \ ConstantInterval.h \ diff --git a/src/AddImageChecks.cpp b/src/AddImageChecks.cpp index b9ef94564049..0d3863bdaa0d 100644 --- a/src/AddImageChecks.cpp +++ b/src/AddImageChecks.cpp @@ -1,4 +1,5 @@ #include "AddImageChecks.h" +#include "CompilerProfiling.h" #include "ExternFuncArgument.h" #include "Function.h" #include "IRMutator.h" @@ -162,6 +163,7 @@ Stmt add_image_checks_inner(Stmt s, const map &env, const FuncValueBounds &fb, bool will_inject_host_copies) { + ZoneScoped; bool no_bounds_query = t.has_feature(Target::NoBoundsQuery); diff --git a/src/AddParameterChecks.cpp b/src/AddParameterChecks.cpp index e63c4a6395d5..18a7dfb21556 100644 --- a/src/AddParameterChecks.cpp +++ b/src/AddParameterChecks.cpp @@ -1,4 +1,5 @@ #include "AddParameterChecks.h" +#include 
"CompilerProfiling.h" #include "IROperator.h" #include "IRVisitor.h" #include "Substitute.h" diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index 4da23c58bd10..809eb9861a3d 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -1016,6 +1016,7 @@ class TightenForkNodes : public IRMutator { } // namespace Stmt fork_async_producers(Stmt s, const map &env) { + ZoneScoped; s = TightenProducerConsumerNodes(env)(s); s = InjectRingBuffering(env)(s); s = ForkAsyncProducers(env)(s); diff --git a/src/Bounds.cpp b/src/Bounds.cpp index 9087516b54b8..d8ca4330df37 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -119,6 +119,7 @@ Expr find_constant_bound(const Expr &e, Direction d, const Scope &scop } Interval find_constant_bounds(const Expr &e, const Scope &scope) { + ZoneScoped; Expr expr = bound_correlated_differences(simplify(remove_likelies(e))); Interval interval = bounds_of_expr_in_scope(expr, scope, FuncValueBounds(), true); interval = simplify(interval); @@ -3021,6 +3022,7 @@ class BoxesTouched : public IRGraphVisitor { map boxes_touched(const Expr &e, Stmt s, bool consider_calls, bool consider_provides, const string &fn, const Scope &scope, const FuncValueBounds &fb) { + ZoneScoped; if (!fn.empty() && s.defined()) { // Filter things down to the relevant sub-Stmts, so we don't spend a // long time reasoning about lets and ifs that don't surround an @@ -3276,6 +3278,7 @@ Interval compute_pure_function_definition_value_bounds( FuncValueBounds compute_function_value_bounds(const vector &order, const map &env) { + ZoneScoped; FuncValueBounds fb; for (const auto &func_name : order) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7558e589fe75..11eaed230b11 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -87,6 +87,7 @@ target_sources( CodeGen_Vulkan_Dev.h CodeGen_WebGPU_Dev.h CompilerLogger.h + CompilerProfiling.h ConciseCasts.h ConstantBounds.h ConstantInterval.h @@ -274,6 +275,7 @@ target_sources( CodeGen_WebGPU_Dev.cpp 
CodeGen_X86.cpp CompilerLogger.cpp + CompilerProfiling.cpp ConstantBounds.cpp ConstantInterval.cpp CPlusPlusMangle.cpp diff --git a/src/CSE.cpp b/src/CSE.cpp index 95f61f0214e9..d86982782dbb 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -1,6 +1,7 @@ #include #include "CSE.h" +#include "CompilerProfiling.h" #include "IREquality.h" #include "IRMutator.h" #include "IROperator.h" @@ -291,6 +292,7 @@ class CSEEveryExprInStmt : public IRMutator { } // namespace Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) { + ZoneScoped; Expr e = e_in; // Early-out for trivial cases. @@ -338,6 +340,7 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) { }; UniqueNameProvider namer; { + ZoneScopedN("UniqueNameProvider"); e.accept(&namer); } diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 7b5a9792247e..61c73233a517 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -208,6 +208,7 @@ class TypeInfoGatherer : public IRGraphVisitor { CodeGen_C::CodeGen_C(ostream &s, const Target &t, OutputKind output_kind, const std::string &guard) : IRPrinter(s), id("$$ BAD ID $$"), target(t), output_kind(output_kind) { + ZoneScoped; if (output_kind == CPlusPlusFunctionInfoHeader) { // If it's a header, emit an include guard. 
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 9263aef95f75..16b1935aca0d 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -9,6 +9,7 @@ #include "CodeGen_LLVM.h" #include "CodeGen_Targets.h" #include "CompilerLogger.h" +#include "CompilerProfiling.h" #include "Debug.h" #include "Deinterleave.h" #include "EmulateFloat16Math.h" @@ -223,6 +224,7 @@ std::unique_ptr CodeGen_LLVM::new_for_target(const Target &target, } void CodeGen_LLVM::initialize_llvm() { + ZoneScoped; static std::once_flag init_llvm_once; std::call_once(init_llvm_once, []() { // You can hack in command-line args to llvm with the @@ -260,6 +262,7 @@ void CodeGen_LLVM::initialize_llvm() { } void CodeGen_LLVM::init_context() { + ZoneScoped; // Ensure our IRBuilder is using the current context. builder = std::make_unique>(*context); @@ -301,6 +304,7 @@ void CodeGen_LLVM::init_context() { } void CodeGen_LLVM::init_module() { + ZoneScoped; init_context(); // Start with a module containing the initial module for this target. @@ -496,6 +500,7 @@ CodeGen_LLVM::ScopedFastMath::~ScopedFastMath() { } std::unique_ptr CodeGen_LLVM::compile(const Module &input) { + ZoneScoped; any_strict_float = input.any_strict_float(); init_codegen(input.name()); @@ -625,6 +630,7 @@ std::unique_ptr CodeGen_LLVM::compile(const Module &input) { } std::unique_ptr CodeGen_LLVM::finish_codegen() { + ZoneScoped; llvm::for_each(*module, set_function_attributes_from_halide_target_options); // Verify the module is ok @@ -704,6 +710,7 @@ void CodeGen_LLVM::end_func(const std::vector &args) { void CodeGen_LLVM::compile_func(const LoweredFunc &f, const std::string &simple_name, const std::string &extern_name) { + ZoneScoped; // Generate the function declaration and argument unpacking code. 
begin_func(f.linkage, simple_name, extern_name, f.args); @@ -1133,6 +1140,7 @@ llvm::Type *CodeGen_LLVM::llvm_type_of(const Type &t) const { } void CodeGen_LLVM::optimize_module() { + ZoneScoped; debug(3) << "Optimizing module\n"; auto time_start = std::chrono::high_resolution_clock::now(); @@ -3998,7 +4006,7 @@ void CodeGen_LLVM::visit(const For *op) { // Pop the loop variable from the scope sym_pop(op->name); } else { - internal_error << "Unknown type of For node. Only Serial and Parallel For nodes should survive down to codegen.\n"; + internal_error << "Unknown type of For node: " << op->for_type << ". Only Serial and Parallel For nodes should survive down to codegen.\n"; } } diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 61fbb8602c10..785a2eecfd5b 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -70,6 +70,7 @@ class CodeGen_Metal_Dev : public CodeGen_GPU_Dev { public: CodeGen_Metal_C(std::ostream &s, const Target &t) : CodeGen_GPU_C(s, t) { + ZoneScoped; abs_returns_unsigned_type = false; #define alias(x, y) \ @@ -643,6 +644,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const FloatImm *op) { void CodeGen_Metal_Dev::add_kernel(Stmt s, const string &name, const vector &args) { + ZoneScoped; debug(2) << "CodeGen_Metal_Dev::compile " << name << "\n"; // We need to scalarize/de-predicate any loads/stores, since Metal does not @@ -676,6 +678,7 @@ struct BufferSize { void CodeGen_Metal_Dev::CodeGen_Metal_C::add_kernel(const Stmt &s, const string &name, const vector &args) { + ZoneScoped; debug(2) << "Adding Metal kernel " << name << "\n"; // Figure out which arguments should be passed in constant. 
@@ -825,6 +828,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::add_kernel(const Stmt &s, } void CodeGen_Metal_Dev::init_module() { + ZoneScoped; debug(2) << "Metal device codegen init_module\n"; // wipe the internal kernel source @@ -865,6 +869,7 @@ void CodeGen_Metal_Dev::init_module() { } vector CodeGen_Metal_Dev::compile_to_src() { + ZoneScoped; string str = src_stream.str(); debug(1) << "Metal kernel:\n" << str << "\n"; @@ -921,6 +926,7 @@ std::string CodeGen_Metal_Dev::print_gpu_name(const std::string &name) { } // namespace std::unique_ptr new_CodeGen_Metal_Dev(const Target &target) { + ZoneScoped; return std::make_unique(target); } diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 807d75444ed4..43d4e60c8a37 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -909,6 +909,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Atomic *op) { void CodeGen_OpenCL_Dev::add_kernel(Stmt s, const string &name, const vector &args) { + ZoneScoped; debug(2) << "CodeGen_OpenCL_Dev::compile " << name << "\n"; // We need to scalarize/de-predicate any loads/stores, since OpenCL does not @@ -943,6 +944,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::add_kernel(Stmt s, const string &name, const vector &args) { + ZoneScoped; debug(2) << "Adding OpenCL kernel " << name << "\n"; debug(2) << "Eliminating bool vectors\n"; diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 0786ade82847..087447eceb94 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -129,6 +129,7 @@ Type CodeGen_PTX_Dev::upgrade_type_for_storage(const Type &t) const { void CodeGen_PTX_Dev::add_kernel(Stmt stmt, const std::string &name, const std::vector &args) { + ZoneScoped; internal_assert(module != nullptr); debug(2) << "In CodeGen_PTX_Dev::add_kernel\n"; @@ -219,6 +220,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt, } void CodeGen_PTX_Dev::init_module() { + ZoneScoped; // This class uses multiple inheritance. 
It's a GPU device code generator,
+// and also an llvm-based one. Both of these track strict_float presence,
+// but OffloadGPULoops only sets the GPU device code generator flag, so here
diff --git a/src/CompilerProfiling.cpp b/src/CompilerProfiling.cpp
new file mode 100644
index 000000000000..06af2b7907b8
--- /dev/null
+++ b/src/CompilerProfiling.cpp
@@ -0,0 +1,193 @@
+#include "CompilerProfiling.h"
+
+#ifdef WITH_COMPILER_PROFILING
+
+#include "Util.h"
+
+#if !defined(_WIN32)
+#include <cxxabi.h>
+#else
+#include <windows.h>
+#pragma warning(disable : 4091)
+#include <dbghelp.h>
+#pragma comment(lib, "dbghelp.lib")
+#endif
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+
+namespace Halide {
+namespace Internal {
+namespace Profiling {
+
+static std::string demangle(const char *name) {
+#if !defined(_WIN32)
+    int status = 0;
+    char *p = abi::__cxa_demangle(name, nullptr, nullptr, &status);
+    std::string ret(p ? p : name);
+    free(p);
+    return ret;
+#else
+    char demangled_name[8192];
+    if (UnDecorateSymbolName(name, demangled_name, sizeof(demangled_name),
+                             UNDNAME_COMPLETE)) {
+        std::string ret(demangled_name);
+        return ret;
+    } else {
+        DWORD error = GetLastError();
+        std::cout << "UnDecorateSymbolName error: " << error << std::endl;
+        return name;
+    }
+#endif
+}
+
+static std::string cleanup_name(std::string name) {
+    debug(2) << "Cleaned " << name << " => ";
+    {
+        std::string_view profiled = "Halide::Internal::Profiling::Profiled<";
+        if (size_t idx = name.find(profiled); idx != std::string::npos) {
+            size_t cl = idx + 1;
+            int num_open = 1;
+            while (num_open != 0 && cl < name.size()) {
+                if (name[cl] == '<') {
+                    num_open++;
+                } else if (name[cl] == '>') {
+                    num_open--;
+                }
+                cl++;
+            }
+            name = name.substr(idx + profiled.size(), cl - idx - profiled.size() - 1);
+        }
+    }
+    name = replace_all(name, "Halide::Internal::", "");
+    name = replace_all(name, "(anonymous namespace)::", "");
+    debug(2) << name << "\n";
+    return name;
+}
+
+Context::Context() {
+    if (std::string file = get_env_variable("HL_COMPILER_TRACE_FILE");
!file.empty()) {
+        if (std::string bits = get_env_variable("HL_COMPILER_TRACE_BITS"); !bits.empty()) {
+            active_bits = BIT_GENERIC;
+            std::vector<std::string> sp = split_string(bits, ",");
+            for (const std::string &s : sp) {
+                if (s == "stmt") {
+                    active_bits |= BIT_STMT;
+                } else if (s == "expr") {
+                    active_bits |= BIT_EXPR;
+                }
+            }
+        } else {
+            active_bits = BIT_GENERIC;
+        }
+    }
+}
+Context::~Context() {
+    if (std::string file = get_env_variable("HL_COMPILER_TRACE_FILE"); !file.empty()) {
+        Profiling::write_halide_profiling_trace(file);
+    }
+}
+
+void write_halide_profiling_trace(const std::string &file) {
+    ZoneScoped;
+    std::lock_guard<std::mutex> lock(ctx.mutex);  // Ensure no threads are born while serializing
+
+    debug(1) << "Emitting trace.json events: " << file << "\n";
+
+    // Pass 1: Find the absolute global t=0 across all threads
+    uint64_t global_start_cycles = static_cast<uint64_t>(-1);
+    for (const auto &trace : ctx.traces) {
+        if (trace.events.empty()) {
+            continue;
+        }
+
+        uint64_t first_cycle = trace.start_cycles_64;
+        uint32_t first_timer = trace.events[0].timer;
+        uint32_t anchor_lower = static_cast<uint32_t>(first_cycle & 0xFFFFFFFF);
+
+        // Did the 32-bit timer roll over between thread creation and the first event?
+        if (first_timer < anchor_lower) {
+            first_cycle += 0x100000000ULL;
+        }
+        first_cycle = (first_cycle & 0xFFFFFFFF00000000ULL) | first_timer;
+
+        if (first_cycle < global_start_cycles) {
+            global_start_cycles = first_cycle;
+        }
+    }
+
+    std::map<const char *, std::string> demangled_names;
+
+    std::ofstream out(file);
+    out << "[\n";
+    bool first = true;
+
+    for (const auto &trace : ctx.traces) {
+        uint64_t current_cycles = trace.start_cycles_64;
+        uint32_t last_timer = static_cast<uint32_t>(current_cycles & 0xFFFFFFFF);
+
+        for (const auto &ev : trace.events) {
+            // Reconstruct the 64-bit timeline for this specific thread
+            if (ev.timer < last_timer) {
+                current_cycles += 0x1'0000'0000ULL;
+            }
+            current_cycles = (current_cycles & 0xFFFFFFFF00000000ULL) | ev.timer;
+            last_timer = ev.timer;
+
+            double ts_us = static_cast<double>(current_cycles - global_start_cycles) * 1e-3;
+
+            if (!first) {
+                out << ",\n";
+            }
+            first = false;
+
+            out << " {";
+            if (ev.event_type == 0) {
+                const char *src_tag = ev.src_tag;
+                if (
+#if !defined(_WIN32)
+                    std::strncmp(src_tag, "N6Halide", 8) == 0
+#else
+                    std::strncmp(src_tag, "class Halide", 12) == 0 ||
+                    std::strncmp(src_tag, "struct Halide", 13) == 0
+#endif
+                ) {
+                    if (auto it = demangled_names.find(src_tag); it == demangled_names.end()) {
+                        std::string dn = demangle(src_tag);
+                        dn = cleanup_name(dn);
+                        auto result = demangled_names.emplace(src_tag, std::move(dn));
+                        src_tag = result.first->second.c_str();
+                    } else {
+                        src_tag = it->second.c_str();
+                    }
+                }
+                // Only add strings to the start event.
+                if (ev.tag == Event::Visitor) {
+                    const char *node_name = IRNodeType_string((IRNodeType)ev.data);
+                    out << "\"name\": \"" << node_name << "\", ";
+                    out << "\"cat\": \"" << src_tag << "\", ";
+                } else if (ev.tag == Event::Generic) {
+                    out << "\"name\": \"" << src_tag << "\", ";
+                    out << "\"args\": {";
+                    out << "\"data\": \"" << ev.data << "\"";
+                    out << "}, ";
+                }
+            }
+            out << "\"ph\": \"" << (ev.event_type == 0 ?
"B" : "E") << "\", ";
+            out << "\"pid\": 1, ";
+            out << "\"tid\": " << trace.tid << ", ";  // Same TID forces perfectly nested flame-graph boxes
+            out << "\"ts\": " << std::fixed << std::setprecision(3) << ts_us;
+
+            out << "}\n";  // Let's put a newline to not kill bad parsers.
+        }
+    }
+
+    out << "\n]\n";
+}
+
+}  // namespace Profiling
+}  // namespace Internal
+}  // namespace Halide
+
+#endif
diff --git a/src/CompilerProfiling.h b/src/CompilerProfiling.h
new file mode 100644
index 000000000000..228b98bce3bf
--- /dev/null
+++ b/src/CompilerProfiling.h
@@ -0,0 +1,245 @@
+#ifndef HALIDE_PROFILED_IR_VISITOR_H
+#define HALIDE_PROFILED_IR_VISITOR_H
+
+#ifdef WITH_COMPILER_PROFILING
+
+#include "IR.h"
+
+#include <chrono>
+#include <list>
+#include <mutex>
+#include <thread>
+
+#ifndef __FUNCTION_NAME__
+#ifdef WIN32  // WINDOWS
+#define __FUNCTION_NAME__ __FUNCTION__
+#else  //*NIX
+#define __FUNCTION_NAME__ __func__
+#endif
+#endif
+
+/** \file
+ * Defines support for tracing/profiling the Halide compiler itself.
+ */
+
+namespace Halide {
+namespace Internal {
+namespace Profiling {
+
+constexpr uint8_t BIT_GENERIC = 1 << 0;
+constexpr uint8_t BIT_STMT = 1 << 1;
+constexpr uint8_t BIT_EXPR = 1 << 2;
+
+inline uint64_t performance_counter() {
+    auto now = std::chrono::high_resolution_clock::now().time_since_epoch();
+    return static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(now).count());
+}
+
+struct Event {
+    const char *src_tag;
+    uint32_t timer;
+    uint8_t event_type : 1;  // 0 for Start, 1 for Stop
+
+    enum Tag : uint8_t {
+        Generic,
+        Visitor,
+    } tag : 1;
+
+    /** Optional for visitors and mutators.
 */
+    uint32_t data : 30;
+};
+static_assert(sizeof(Event) == sizeof(void *) + 8);
+
+struct ThreadTrace {
+    uint32_t tid;
+    uint64_t start_cycles_64;  // 64-bit anchor to align threads accurately
+    std::vector<Event> events;
+};
+
+struct Context {
+    std::mutex mutex;
+    uint8_t active_bits{0};
+
+    // std::list guarantees that pointers to elements are never invalidated
+    std::list<ThreadTrace> traces;
+
+    Context();
+    ~Context();
+};
+
+inline Context ctx;
+
+inline ThreadTrace &init_thread_profiler() {
+    std::lock_guard<std::mutex> lock(ctx.mutex);
+
+    // Hash thread ID to get a clean 32-bit integer for the Chrome UI
+    uint32_t tid = static_cast<uint32_t>(std::hash<std::thread::id>{}(std::this_thread::get_id()));
+    uint64_t anchor = performance_counter();
+
+    ctx.traces.push_back({tid, anchor, {}});
+    ThreadTrace &tt = ctx.traces.back();
+
+    // Pre-allocate ~16MB per thread to guarantee zero allocations during hot loops
+    tt.events.reserve(1'000'000);
+
+    return tt;
+}
+
+inline thread_local ThreadTrace &tls_thread_trace = init_thread_profiler();
+inline thread_local std::vector<Event> *tls_profiling_events = &tls_thread_trace.events;
+
+inline void zone_begin(const char *src_tag, Event::Tag tag, unsigned data = 0) {
+    Event pe;
+    pe.timer = static_cast<uint32_t>(performance_counter());
+    pe.tag = tag;
+    pe.src_tag = src_tag;
+    pe.event_type = 0;  // Start
+    pe.data = data;
+    tls_profiling_events->push_back(pe);
+}
+
+inline void zone_end(const char *src_tag, Event::Tag tag, unsigned data = 0) {
+    Event pe;
+    pe.timer = static_cast<uint32_t>(performance_counter());
+    pe.tag = tag;
+    pe.src_tag = src_tag;
+    pe.event_type = 1;  // Stop
+    pe.data = data;
+    tls_profiling_events->push_back(pe);
+}
+
+inline void generic_zone_begin(const char *src_tag, unsigned data = 0) {
+    if (ctx.active_bits & BIT_GENERIC) {
+        zone_begin(src_tag, Event::Tag::Generic, data);
+    }
+}
+
+inline void generic_zone_end(const char *src_tag, unsigned data = 0) {
+    if (ctx.active_bits & BIT_GENERIC) {
+        zone_end(src_tag, Event::Tag::Generic, data);
+    }
+}
+
+struct 
ZoneScopedVisitor_ { + IRNodeType node_type; + const char *src_tag; + bool active; + + ZoneScopedVisitor_(IRNodeType node_type, const char *src_tag, uint8_t activation_bits) + : node_type(node_type), src_tag(src_tag), active(ctx.active_bits & activation_bits) { + if (active) { + zone_begin(src_tag, Event::Visitor, (unsigned)node_type); + } + } + + ZoneScopedVisitor_(const Expr &e, const char *src_tag) + : node_type(e.defined() ? e->node_type : IRNodeType::IntImm), + src_tag(src_tag), + active((ctx.active_bits & BIT_EXPR) && e.defined()) { + if (active) { + zone_begin(src_tag, Event::Visitor, (unsigned)node_type); + } + } + + ZoneScopedVisitor_(const Stmt &s, const char *src_tag) + : node_type(s.defined() ? s->node_type : IRNodeType::IntImm), + src_tag(src_tag), + active((ctx.active_bits & BIT_STMT) && s.defined()) { + if (active) { + zone_begin(src_tag, Event::Visitor, (unsigned)node_type); + } + } + + ~ZoneScopedVisitor_() { + if (active) { + zone_end(src_tag, Event::Visitor, (unsigned)node_type); + } + } +}; + +struct ZoneScoped_ { + Event::Tag tag; + const char *src_tag; + + ZoneScoped_(const char *src_tag, unsigned data = 0, Event::Tag tag = Event::Tag::Generic) + : tag(tag), + src_tag(src_tag) { + zone_begin(src_tag, tag, data); + } + + ~ZoneScoped_() { + zone_end(src_tag, tag, 0); + } +}; + +#define ZoneScoped \ + Halide::Internal::Profiling::ZoneScoped_ __zone_scoped(__FUNCTION_NAME__) +#define ZoneScopedN(...) \ + Halide::Internal::Profiling::ZoneScoped_ __zone_scoped(__VA_ARGS__) +#define ZoneScopedVisitor(...) 
\
+    Halide::Internal::Profiling::ZoneScopedVisitor_ __zone_scoped(__VA_ARGS__)
+
+#ifdef HALIDE_ENABLE_RTTI
+#define HalideVisitorDynamicNameTag typeid(*this).name()
+#else
+#define HalideVisitorDynamicNameTag __FUNCTION__
+#endif
+
+template<typename Base>
+class Profiled : public Base {
+public:
+    using Base::Base;
+#ifdef HALIDE_ENABLE_RTTI
+    const char *tag = typeid(Base).name();
+#else
+    const char *tag = "Unknown (no RTTI)";
+#endif
+
+#define PROFILE_VISIT_STMT_OVERRIDE(T)                                    \
+    auto visit(const T *op) -> decltype(this->Base::visit(op)) override { \
+        ZoneScopedVisitor_ _prof(IRNodeType::T, tag, BIT_STMT);           \
+        return Base::visit(op);                                           \
+    }
+    HALIDE_FOR_EACH_IR_STMT(PROFILE_VISIT_STMT_OVERRIDE)
+
+#undef PROFILE_VISIT_STMT_OVERRIDE
+
+#define PROFILE_VISIT_EXPR_OVERRIDE(T)                                    \
+    auto visit(const T *op) -> decltype(this->Base::visit(op)) override { \
+        ZoneScopedVisitor_ _prof(IRNodeType::T, tag, BIT_EXPR);           \
+        return Base::visit(op);                                           \
+    }
+    HALIDE_FOR_EACH_IR_EXPR(PROFILE_VISIT_EXPR_OVERRIDE)
+
+#undef PROFILE_VISIT_EXPR_OVERRIDE
+};
+
+void write_halide_profiling_trace(const std::string &file);
+
+}  // namespace Profiling
+
+template<typename Base>
+using Profiled = Profiling::Profiled<Base>;
+
+}  // namespace Internal
+}  // namespace Halide
+
+#else
+
+namespace Profiling {
+inline void generic_zone_begin(const char *src_tag, unsigned data = 0) {
+}
+inline void generic_zone_end(const char *src_tag, unsigned data = 0) {
+}
+}  // namespace Profiling
+
+template<typename Base>
+using Profiled = Base;
+#define ZoneScoped
+#define ZoneScopedN(...)
+#define ZoneScopedVisitor(...)
+#define HalideVisitorDynamicNameTag + +#endif + +#endif // HALIDE_PROFILED_IR_VISITOR_H diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index ec85e5a383f9..a8d447bd391c 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -1603,6 +1603,7 @@ class NormalizeIfStatements : public IRMutator { } // namespace Stmt fuse_gpu_thread_loops(Stmt s) { + ZoneScoped; // NormalizeIfStatements pushes the predicates between GPU blocks // into the innermost GPU block. FuseGPUThreadLoops would then // merge the predicate into the merged GPU thread. diff --git a/src/IRMutator.h b/src/IRMutator.h index 5a389fc4870d..ab502f3edade 100644 --- a/src/IRMutator.h +++ b/src/IRMutator.h @@ -9,6 +9,7 @@ #include #include +#include "CompilerProfiling.h" #include "IR.h" namespace Halide { @@ -35,15 +36,18 @@ class IRMutator { * sub-statements. */ inline Expr operator()(const Expr &expr) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate(expr); } inline Stmt operator()(const Stmt &stmt) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate(stmt); } // Like mutate_with_changes, but discard the changes flag. 
std::vector operator()(const std::vector &exprs) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate_with_changes(exprs).first; } @@ -92,9 +96,11 @@ class IRGraphMutator : public IRMutator { public: inline Expr operator()(const Expr &expr) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate(expr); } inline Stmt operator()(const Stmt &stmt) { + ZoneScopedN(HalideVisitorDynamicNameTag); return mutate(stmt); } }; @@ -134,15 +140,17 @@ struct LambdaMutator final : IRMutator { } protected: -#define HALIDE_CALL_VISIT_EXPR_IMPL(T) \ - Expr visit(const T *op) override { \ - return this->visit_impl(op); \ +#define HALIDE_CALL_VISIT_EXPR_IMPL(T) \ + Expr visit(const T *op) override { \ + ZoneScopedVisitor(IRNodeType::T, "LambdaMutator", Profiling::BIT_EXPR); \ + return this->visit_impl(op); \ } HALIDE_FOR_EACH_IR_EXPR(HALIDE_CALL_VISIT_EXPR_IMPL) #undef HALIDE_CALL_VISIT_EXPR_IMPL -#define HALIDE_CALL_VISIT_STMT_IMPL(T) \ - Stmt visit(const T *op) override { \ - return this->visit_impl(op); \ +#define HALIDE_CALL_VISIT_STMT_IMPL(T) \ + Stmt visit(const T *op) override { \ + ZoneScopedVisitor(IRNodeType::T, "LambdaMutator", Profiling::BIT_STMT); \ + return this->visit_impl(op); \ } HALIDE_FOR_EACH_IR_STMT(HALIDE_CALL_VISIT_STMT_IMPL) #undef HALIDE_CALL_VISIT_STMT_IMPL diff --git a/src/IROperator.cpp b/src/IROperator.cpp index c729539daa29..bd5319944f26 100644 --- a/src/IROperator.cpp +++ b/src/IROperator.cpp @@ -1075,18 +1075,22 @@ T remove_intrinsics(const T &e, const std::initializer_list & } // namespace Expr remove_likelies(const Expr &e) { + ZoneScoped; return remove_intrinsics(e, {Call::likely, Call::likely_if_innermost}); } Stmt remove_likelies(const Stmt &s) { + ZoneScoped; return remove_intrinsics(s, {Call::likely, Call::likely_if_innermost}); } Expr remove_promises(const Expr &e) { + ZoneScoped; return remove_intrinsics(e, {Call::promise_clamped, Call::unsafe_promise_clamped}); } Stmt remove_promises(const Stmt &s) { + ZoneScoped; return 
remove_intrinsics(s, {Call::promise_clamped, Call::unsafe_promise_clamped}); } diff --git a/src/IRVisitor.h b/src/IRVisitor.h index 322f2bff6ba3..be0df5168e73 100644 --- a/src/IRVisitor.h +++ b/src/IRVisitor.h @@ -3,6 +3,7 @@ #include +#include "CompilerProfiling.h" #include "IR.h" /** \file @@ -22,15 +23,18 @@ class IRVisitor { virtual ~IRVisitor() = default; inline void operator()(const Stmt &s) { + ZoneScopedN(HalideVisitorDynamicNameTag); s.accept(this); } inline void operator()(const Expr &e) { + ZoneScopedN(HalideVisitorDynamicNameTag); e.accept(this); } template inline void operator()(const T *op) { + ZoneScopedN(HalideVisitorDynamicNameTag); visit(op); } @@ -120,9 +124,11 @@ void visit_with(const IRHandle &ir, Lambdas &&...lambdas) { class IRGraphVisitor : public IRVisitor { public: inline void operator()(const Expr &e) { + ZoneScopedN(HalideVisitorDynamicNameTag); include(e); } inline void operator()(const Stmt &s) { + ZoneScopedN(HalideVisitorDynamicNameTag); include(s); } @@ -160,11 +166,20 @@ class IRGraphVisitor : public IRVisitor { template class VariadicVisitor { private: +#ifdef WITH_COMPILER_PROFILING +#ifdef HALIDE_ENABLE_RTTI + const char *name = typeid(T).name(); +#else + const char *name = "VariadicVisitor"; +#endif +#endif + template ExprRet dispatch_expr(const BaseExprNode *node, Args &&...args) { if (node == nullptr) { return ExprRet{}; } + ZoneScopedVisitor(node->node_type, name, Profiling::BIT_EXPR); switch (node->node_type) { #define HALIDE_SWITCH_EXPR(NT) \ case IRNodeType::NT: \ @@ -183,6 +198,7 @@ class VariadicVisitor { if (node == nullptr) { return StmtRet{}; } + ZoneScopedVisitor(node->node_type, name, Profiling::BIT_STMT); switch (node->node_type) { #define HALIDE_SWITCH_STMT(NT) \ case IRNodeType::NT: \ diff --git a/src/InjectHostDevBufferCopies.cpp b/src/InjectHostDevBufferCopies.cpp index 6e8b21686657..c3b57b0c8435 100644 --- a/src/InjectHostDevBufferCopies.cpp +++ b/src/InjectHostDevBufferCopies.cpp @@ -1,6 +1,7 @@ #include 
"InjectHostDevBufferCopies.h" #include "CodeGen_GPU_Dev.h" +#include "CompilerProfiling.h" #include "Debug.h" #include "ExternFuncArgument.h" #include "IRMutator.h" @@ -217,6 +218,7 @@ class InjectBufferCopiesForSingleBuffer : public IRMutator { } Stmt do_copies(Stmt s, FindBufferUsage *precomputed = nullptr) { + ZoneScoped; // Sniff what happens to the buffer inside the stmt FindBufferUsage local_finder(buffer, DeviceAPI::Host); if (!precomputed) { @@ -570,6 +572,7 @@ class InjectBufferCopies : public IRMutator { }; Stmt inject_free_after_last_use(Stmt body, const Stmt &last_use, const Stmt &free_stmt) { + ZoneScoped; bool success = false; body = mutate_with( body, @@ -762,6 +765,7 @@ class InjectBufferCopiesForInputsAndOutputs : public IRMutator { finder(s); Stmt new_stmt = s; for (const string &buf : finder.result) { + ZoneScopedN("InjectBufferCopiesForSingleBuffer"); new_stmt = InjectBufferCopiesForSingleBuffer(buf, true, finder.result_storage.at(buf))(new_stmt); } return new_stmt; diff --git a/src/JITModule.cpp b/src/JITModule.cpp index 4bdba15bdf25..02d8f293feeb 100644 --- a/src/JITModule.cpp +++ b/src/JITModule.cpp @@ -285,6 +285,7 @@ namespace { // Retrieve a function pointer from an llvm module, possibly by compiling it. 
JITModule::Symbol compile_and_get_function(llvm::orc::LLJIT &JIT, const string &name) { + ZoneScoped; debug(2) << "JIT Compiling " << name << "\n"; auto addr = JIT.lookup(name); @@ -341,6 +342,7 @@ JITModule::JITModule() { JITModule::JITModule(const Module &m, const LoweredFunc &fn, const std::vector &dependencies) { + ZoneScoped; jit_module = new JITModuleContents(); std::unique_ptr llvm_module(compile_module_to_llvm_module(m, *jit_module->context)); std::vector deps_with_runtime = dependencies; @@ -358,6 +360,7 @@ void compile_module_impl( std::unique_ptr m, const string &function_name, const Target &target, const std::vector &dependencies, const std::vector &requested_exports) { + ZoneScoped; // Ensure that LLVM is initialized CodeGen_LLVM::initialize_llvm(); @@ -532,6 +535,7 @@ void compile_module_impl( void JITModule::compile_module(std::unique_ptr m, const string &function_name, const Target &target, const std::vector &dependencies, const std::vector &requested_exports) { + ZoneScoped; // LLJIT's SimpleCompiler triggers LLVM's AsmPrinter, which can use a large // amount of stack (observed stack overflows on macOS worker threads with // 512KB default stacks). Use run_with_large_stack to ensure enough space. 
diff --git a/src/LICM.cpp b/src/LICM.cpp index d0ee80aad177..a34c194cecb5 100644 --- a/src/LICM.cpp +++ b/src/LICM.cpp @@ -1,5 +1,6 @@ #include "LICM.h" #include "CSE.h" +#include "CompilerProfiling.h" #include "ExprUsesVar.h" #include "IREquality.h" #include "IRMutator.h" @@ -525,6 +526,7 @@ class GroupLoopInvariants : public IRMutator { } // namespace Stmt hoist_loop_invariant_values(Stmt s) { + ZoneScoped; s = GroupLoopInvariants()(s); s = common_subexpression_elimination(s); s = LICM()(s); diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index 6eaf7496f178..6e65018035ef 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -1,4 +1,5 @@ #include "LLVM_Runtime_Linker.h" +#include "CompilerProfiling.h" #include "Error.h" #include "LLVM_Headers.h" #include "Target.h" @@ -11,6 +12,7 @@ using std::vector; namespace { std::unique_ptr parse_bitcode_file(llvm::StringRef buf, llvm::LLVMContext *context, const char *id) { + ZoneScoped; llvm::MemoryBufferRef bitcode_buffer = llvm::MemoryBufferRef(buf, id); @@ -618,6 +620,7 @@ void convert_weak_to_linkonce(llvm::GlobalValue &gv) { // triple appropriately for the target. void link_modules(std::vector> &modules, Target t, bool allow_stripping_all_weak_functions = false) { + ZoneScoped; llvm::DataLayout data_layout = get_data_layout_for_target(t); llvm::Triple triple = Internal::get_triple_for_target(t); @@ -846,6 +849,7 @@ namespace Internal { std::unique_ptr link_with_wasm_jit_runtime(llvm::LLVMContext *c, const Target &t, std::unique_ptr extra_module) { + ZoneScoped; bool bits_64 = (t.bits == 64); bool debug = t.has_feature(Target::Debug); @@ -889,6 +893,7 @@ std::unique_ptr link_with_wasm_jit_runtime(llvm::LLVMContext *c, c /** Create an llvm module containing the support code for a given target. 
*/ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVMContext *c, bool for_shared_jit_runtime, bool just_gpu) { + ZoneScoped; enum InitialModuleType { ModuleAOT, ModuleAOTNoRuntime, @@ -1381,6 +1386,7 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM #ifdef WITH_NVPTX std::unique_ptr get_initial_module_for_ptx_device(Target target, llvm::LLVMContext *c) { + ZoneScoped; std::vector> modules; modules.push_back(get_initmod_ptx_dev_ll(c)); @@ -1434,6 +1440,7 @@ std::unique_ptr get_initial_module_for_ptx_device(Target target, l void add_bitcode_to_module(llvm::LLVMContext *context, llvm::Module &module, const std::vector &bitcode, const std::string &name) { + ZoneScoped; llvm::StringRef sb = llvm::StringRef((const char *)&bitcode[0], bitcode.size()); std::unique_ptr add_in = parse_bitcode_file(sb, context, name.c_str()); diff --git a/src/Lower.cpp b/src/Lower.cpp index c248684ea5d9..ac34766fc9fb 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -21,6 +21,7 @@ #include "CanonicalizeGPUVars.h" #include "ClampUnsafeAccesses.h" #include "CompilerLogger.h" +#include "CompilerProfiling.h" #include "Debug.h" #include "DebugArguments.h" #include "DebugToFile.h" @@ -94,29 +95,52 @@ namespace { class LoweringLogger { Stmt last_written; std::chrono::time_point last_time; + const char *last_msg; + std::vector> timings; bool time_lowering_passes = false; public: LoweringLogger() { - last_time = std::chrono::high_resolution_clock::now(); static bool should_time = !get_env_variable("HL_TIME_LOWERING_PASSES").empty(); time_lowering_passes = should_time; } - void operator()(const string &message, const Stmt &s) { + void begin(const char *msg) { + debug(1) << "Lowering pass: " << msg << "...\n"; + Profiling::generic_zone_begin(msg); + last_time = std::chrono::high_resolution_clock::now(); + last_msg = msg; + } + + void begin(const char *msg, int data) { + debug(1) << "Lowering pass: " << msg << " " << data << "...\n"; + 
Profiling::generic_zone_begin(msg, data); + last_time = std::chrono::high_resolution_clock::now(); + last_msg = msg; + } + + void end() { + Profiling::generic_zone_end(last_msg); auto t = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = t - last_time; + timings.emplace_back(diff.count() * 1000, last_msg); + } + + void end(const Stmt &s) { + Profiling::generic_zone_end(last_msg); + auto t = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = t - last_time; + timings.emplace_back(diff.count() * 1000, last_msg); if (!s.same_as(last_written)) { - debug(2) << message << "\n" + debug(2) << "Lowering after " << last_msg << "\n" << s << "\n"; last_written = s; last_time = t; } else { - debug(2) << message << " (unchanged)\n\n"; + debug(2) << "Lowering after " << last_msg << " (unchanged)\n\n"; last_time = t; } - timings.emplace_back(diff.count() * 1000, message); } ~LoweringLogger() { @@ -143,6 +167,7 @@ void lower_impl(const vector &output_funcs, bool trace_pipeline, const vector &custom_passes, Module &result_module) { + ZoneScoped; auto time_start = std::chrono::high_resolution_clock::now(); size_t initial_lowered_function_count = result_module.functions().size(); @@ -173,293 +198,294 @@ void lower_impl(const vector &output_funcs, LoweringLogger log; - debug(1) << "Creating initial loop nests...\n"; + log.begin("Creating initial loop nests"); bool any_memoized = false; Stmt s = schedule_functions(outputs, fused_groups, env, t, any_memoized); - log("Lowering after creating initial loop nests:", s); + log.end(s); if (any_memoized) { - debug(1) << "Injecting memoization...\n"; + log.begin("Injecting memoization"); s = inject_memoization(s, env, pipeline_name, outputs); - log("Lowering after injecting memoization:", s); + log.end(s); } else { debug(1) << "Skipping injecting memoization...\n"; } - debug(1) << "Injecting tracing...\n"; + log.begin("Injecting tracing"); s = inject_tracing(s, pipeline_name, trace_pipeline, env, 
outputs, t); - log("Lowering after injecting tracing:", s); + log.end(s); - debug(1) << "Adding checks for parameters\n"; + log.begin("Adding checks for parameters"); s = add_parameter_checks(requirements, s, t); - log("Lowering after injecting parameter checks:", s); + log.end(s); // Compute the maximum and minimum possible value of each // function. Used in later bounds inference passes. - debug(1) << "Computing bounds of each function's value\n"; + log.begin("Computing bounds of each function's value"); FuncValueBounds func_bounds = compute_function_value_bounds(order, env); + log.end(); // Clamp unsafe instances where a Func f accesses a Func g using // an index which depends on a third Func h. - debug(1) << "Clamping unsafe data-dependent accesses\n"; + log.begin("Clamping unsafe data-dependent accesses"); s = clamp_unsafe_accesses(s, env, func_bounds); - log("Lowering after clamping unsafe data-dependent accesses", s); + log.end(s); // This pass injects nested definitions of variable names, so we // can't simplify statements from here until we fix them up. (We // can still simplify Exprs). 
- debug(1) << "Performing computation bounds inference...\n"; + log.begin("Bounds inference"); s = bounds_inference(s, outputs, order, fused_groups, env, func_bounds, t); - log("Lowering after computation bounds inference:", s); + log.end(s); - debug(1) << "Asserting that all split factors are positive...\n"; + log.begin("Asserting that all split factors are positive"); s = add_split_factor_checks(s, env); - log("Lowering after asserting that all split factors are positive:", s); + log.end(s); - debug(1) << "Removing extern loops...\n"; + log.begin("Removing extern loops"); s = remove_extern_loops(s); - log("Lowering after removing extern loops:", s); + log.end(s); - debug(1) << "Performing sliding window optimization...\n"; + log.begin("Sliding window optimization"); s = sliding_window(s, env); - log("Lowering after sliding window:", s); + log.end(s); // This uniquifies the variable names, so we're good to simplify // after this point. This lets later passes assume syntactic // equivalence means semantic equivalence. 
- debug(1) << "Uniquifying variable names...\n"; + log.begin("Uniquifying variable names"); s = uniquify_variable_names(s); - log("Lowering after uniquifying variable names:", s); + log.end(s); - debug(1) << "Simplifying...\n"; + log.begin("Simplifying"); s = simplify(s); - log("Lowering after first simplification:", s); + log.end(s); - debug(1) << "Simplifying correlated differences...\n"; + log.begin("Simplifying correlated differences"); s = simplify_correlated_differences(s); - log("Lowering after simplifying correlated differences:", s); + log.end(s); - debug(1) << "Performing allocation bounds inference...\n"; + log.begin("Allocation bounds inference"); s = allocation_bounds_inference(s, env, func_bounds); - log("Lowering after allocation bounds inference:", s); + log.end(s); bool will_inject_host_copies = (t.has_gpu_feature() || t.has_feature(Target::HexagonDma) || (t.arch != Target::Hexagon && (t.has_feature(Target::HVX)))); - debug(1) << "Adding checks for images\n"; + log.begin("Adding checks for images"); s = add_image_checks(s, outputs, t, order, env, func_bounds, will_inject_host_copies); - log("Lowering after injecting image checks:", s); + log.end(s); - debug(1) << "Removing code that depends on undef values...\n"; + log.begin("Removing code that depends on undef values"); s = remove_undef(s); - log("Lowering after removing code that depends on undef values:", s); + log.end(s); - debug(1) << "Performing storage folding optimization...\n"; + log.begin("Performing storage folding optimization"); s = storage_folding(s, env); - log("Lowering after storage folding:", s); + log.end(s); - debug(1) << "Injecting debug_to_file calls...\n"; + log.begin("Injecting debug_to_file calls"); s = debug_to_file(s, outputs, env); - log("Lowering after injecting debug_to_file calls:", s); + log.end(s); - debug(1) << "Injecting prefetches...\n"; + log.begin("Injecting prefetches"); s = inject_prefetch(s, env); - log("Lowering after injecting prefetches:", s); + 
log.end(s); - debug(1) << "Discarding safe promises...\n"; + log.begin("Discarding safe promises"); s = lower_safe_promises(s); - log("Lowering after discarding safe promises:", s); + log.end(s); - debug(1) << "Dynamically skipping stages...\n"; + log.begin("Dynamically skipping stages"); s = skip_stages(s, outputs, fused_groups, env); - log("Lowering after dynamically skipping stages:", s); + log.end(s); - debug(1) << "Forking asynchronous producers...\n"; + log.begin("Forking asynchronous producers"); s = fork_async_producers(s, env); - log("Lowering after forking asynchronous producers:", s); + log.end(s); - debug(1) << "Destructuring tuple-valued realizations...\n"; + log.begin("Destructuring tuple-valued realizations"); s = split_tuples(s, env); - log("Lowering after destructuring tuple-valued realizations:", s); + log.end(s); if (t.has_gpu_feature()) { - debug(1) << "Canonicalizing GPU var names...\n"; + log.begin("Canonicalizing GPU var names"); s = canonicalize_gpu_vars(s); - log("Lowering after canonicalizing GPU var names:", s); + log.end(s); } - debug(1) << "Bounding small realizations...\n"; + log.begin("Bounding small realizations"); s = simplify_correlated_differences(s); s = bound_small_allocations(s); - log("Lowering after bounding small realizations:", s); + log.end(s); - debug(1) << "Performing storage flattening...\n"; + log.begin("Performing storage flattening"); s = storage_flattening(s, outputs, env, t); - log("Lowering after storage flattening:", s); + log.end(s); - debug(1) << "Adding atomic mutex allocation...\n"; + log.begin("Adding atomic mutex allocation"); s = add_atomic_mutex(s, outputs); - log("Lowering after adding atomic mutex allocation:", s); + log.end(s); - debug(1) << "Unpacking buffer arguments...\n"; + log.begin("Unpacking buffer arguments"); s = unpack_buffers(s); - log("Lowering after unpacking buffer arguments:", s); + log.end(s); if (any_memoized) { - debug(1) << "Rewriting memoized allocations...\n"; + 
log.begin("Rewriting memoized allocations"); s = rewrite_memoized_allocations(s, env); - log("Lowering after rewriting memoized allocations:", s); + log.end(s); } else { - debug(1) << "Skipping rewriting memoized allocations...\n"; + debug(1) << "Skipping rewriting memoized allocations\n"; } if (will_inject_host_copies) { - debug(1) << "Selecting a GPU API for GPU loops...\n"; + log.begin("Selecting a GPU API for GPU loops"); s = select_gpu_api(s, t); - log("Lowering after selecting a GPU API:", s); + log.end(s); - debug(1) << "Injecting host <-> dev buffer copies...\n"; + log.begin("Injecting host <-> dev buffer copies"); s = inject_host_dev_buffer_copies(s, t); - log("Lowering after injecting host <-> dev buffer copies:", s); + log.end(s); - debug(1) << "Selecting a GPU API for extern stages...\n"; + log.begin("Selecting a GPU API for extern stages"); s = select_gpu_api(s, t); - log("Lowering after selecting a GPU API for extern stages:", s); + log.end(s); } else { - debug(1) << "Injecting host-dirty marking...\n"; + log.begin("Injecting host-dirty marking"); s = inject_host_dev_buffer_copies(s, t); - log("Lowering after injecting host-dirty marking:", s); + log.end(s); } - debug(1) << "Simplifying...\n"; + log.begin("Simplifying"); s = simplify(s); s = unify_duplicate_lets(s); - log("Lowering after second simplification:", s); + log.end(s); - debug(1) << "Reduce prefetch dimension...\n"; + log.begin("Reduce prefetch dimension"); s = reduce_prefetch_dimension(s, t); - log("Lowering after reduce prefetch dimension:", s); + log.end(s); - debug(1) << "Simplifying correlated differences...\n"; + log.begin("Simplifying correlated differences"); s = simplify_correlated_differences(s); - log("Lowering after simplifying correlated differences:", s); + log.end(s); - debug(1) << "Bounding constant extent loops...\n"; + log.begin("Bounding constant extent loops"); s = bound_constant_extent_loops(s); - log("Lowering after bounding constant extent loops:", s); + log.end(s); - 
debug(1) << "Unrolling...\n"; + log.begin("Unrolling"); s = unroll_loops(s); - log("Lowering after unrolling:", s); + log.end(s); - debug(1) << "Vectorizing...\n"; + log.begin("Vectorizing"); s = vectorize_loops(s, env); s = simplify(s); - log("Lowering after vectorizing:", s); + log.end(s); if (t.has_gpu_feature() || t.has_feature(Target::Vulkan)) { - debug(1) << "Injecting per-block gpu synchronization...\n"; + log.begin("Injecting per-block gpu synchronization"); s = fuse_gpu_thread_loops(s); - log("Lowering after injecting per-block gpu synchronization:", s); + log.end(s); } - debug(1) << "Detecting vector interleavings...\n"; + log.begin("Detecting vector interleavings"); s = rewrite_interleavings(s); s = simplify(s); - log("Lowering after rewriting vector interleavings:", s); + log.end(s); - debug(1) << "Partitioning loops to simplify boundary conditions...\n"; + log.begin("Partitioning loops to simplify boundary conditions"); s = partition_loops(s); s = simplify(s); - log("Lowering after partitioning loops:", s); + log.end(s); - debug(1) << "Staging strided loads...\n"; + log.begin("Staging strided loads"); s = stage_strided_loads(s, t); - log("Lowering after staging strided loads:", s); + log.end(s); - debug(1) << "Trimming loops to the region over which they do something...\n"; + log.begin("Trimming loops to the region over which they do something"); s = trim_no_ops(s); - log("Lowering after loop trimming:", s); + log.end(s); - debug(1) << "Rebasing loops to zero...\n"; + log.begin("Rebasing loops to zero"); s = rebase_loops_to_zero(s); - debug(2) << "Lowering after rebasing loops to zero:\n" - << s << "\n\n"; + log.end(s); - debug(1) << "Hoisting loop invariant if statements...\n"; + log.begin("Hoisting loop invariant if statements"); s = hoist_loop_invariant_if_statements(s); - log("Lowering after hoisting loop invariant if statements:", s); + log.end(s); - debug(1) << "Injecting early frees...\n"; + log.begin("Injecting early frees"); s = 
inject_early_frees(s); - log("Lowering after injecting early frees:", s); + log.end(s); if (t.has_feature(Target::FuzzFloatStores)) { - debug(1) << "Fuzzing floating point stores...\n"; + log.begin("Fuzzing floating point stores"); s = fuzz_float_stores(s); - log("Lowering after fuzzing floating point stores:", s); + log.end(s); } - debug(1) << "Simplifying correlated differences...\n"; + log.begin("Simplifying correlated differences"); s = simplify_correlated_differences(s); - log("Lowering after simplifying correlated differences:", s); + log.end(s); - debug(1) << "Bounding small allocations...\n"; + log.begin("Bounding small allocations"); s = bound_small_allocations(s); - log("Lowering after bounding small allocations:", s); + log.end(s); if (t.has_feature(Target::Profile) || t.has_feature(Target::ProfileByTimer)) { - debug(1) << "Injecting profiling...\n"; + log.begin("Injecting profiling"); s = inject_profiling(s, pipeline_name, env); - log("Lowering after injecting profiling:", s); + log.end(s); } if (t.has_feature(Target::CUDA)) { - debug(1) << "Injecting warp shuffles...\n"; + log.begin("Injecting warp shuffles"); s = lower_warp_shuffles(s, t); - log("Lowering after injecting warp shuffles:", s); + log.end(s); } - debug(1) << "Simplifying...\n"; + log.begin("Simplifying"); s = common_subexpression_elimination(s); + log.end(); - debug(1) << "Lowering unsafe promises...\n"; + log.begin("Lowering unsafe promises"); s = lower_unsafe_promises(s, t); - log("Lowering after lowering unsafe promises:", s); + log.end(s); if (t.has_feature(Target::AVX512_SapphireRapids)) { - debug(1) << "Extracting tile operations...\n"; + log.begin("Extracting tile operations"); s = extract_tile_operations(s); - log("Lowering after extracting tile operations:", s); + log.end(s); } - debug(1) << "Flattening nested ramps...\n"; + log.begin("Flattening nested ramps"); s = flatten_nested_ramps(s); - log("Lowering after flattening nested ramps:", s); + log.end(s); - debug(1) << "Removing 
dead allocations and moving loop invariant code...\n"; + log.begin("Removing dead allocations and moving loop invariant code"); s = remove_dead_allocations(s); s = simplify(s); s = hoist_loop_invariant_values(s); s = hoist_loop_invariant_if_statements(s); - log("Lowering after removing dead allocations and hoisting loop invariants:", s); + log.end(s); - debug(1) << "Finding intrinsics...\n"; + log.begin("Finding intrinsics"); // Must be run after the last simplification, because it turns // divisions into shifts, which the simplifier reverses. s = find_intrinsics(s); - log("Lowering after finding intrinsics:", s); + log.end(s); - debug(1) << "Hoisting prefetches...\n"; + log.begin("Hoisting prefetches"); s = hoist_prefetches(s); - log("Lowering after hoisting prefetches:", s); + log.end(s); if (t.has_feature(Target::NoAsserts)) { - debug(1) << "Stripping asserts...\n"; + log.begin("Stripping asserts"); s = strip_asserts(s); - log("Lowering after stripping asserts:", s); + log.end(s); } debug(1) << "Lowering after final simplification:\n" @@ -467,10 +493,9 @@ void lower_impl(const vector &output_funcs, if (!custom_passes.empty()) { for (size_t i = 0; i < custom_passes.size(); i++) { - debug(1) << "Running custom lowering pass " << i << "...\n"; - s = (*custom_passes[i])(s); - debug(1) << "Lowering after custom pass " << i << ":\n" - << s << "\n\n"; + log.begin("Custom lowering pass", i); + s = custom_passes[i]->operator()(s); + log.end(s); } } @@ -478,19 +503,17 @@ void lower_impl(const vector &output_funcs, result_module.set_conceptual_code_stmt(s); if (t.arch != Target::Hexagon && t.has_feature(Target::HVX)) { - debug(1) << "Splitting off Hexagon offload...\n"; + log.begin("Splitting off Hexagon offload"); s = inject_hexagon_rpc(s, t, result_module); - debug(2) << "Lowering after splitting off Hexagon offload:\n" - << s << "\n"; + log.end(s); } else { debug(1) << "Skipping Hexagon offload...\n"; } if (t.has_gpu_feature()) { - debug(1) << "Offloading GPU 
loops...\n"; + log.begin("Offloading GPU loops"); s = inject_gpu_offload(s, t, any_strict_float); - debug(2) << "Lowering after splitting off GPU loops:\n" - << s << "\n\n"; + log.end(s); } else { debug(1) << "Skipping GPU offload...\n"; } @@ -505,7 +528,7 @@ void lower_impl(const vector &output_funcs, vector inferred_args = infer_arguments(s, outputs); std::vector closure_implementations; - debug(1) << "Lowering Parallel Tasks...\n"; + log.begin("Lowering Parallel Tasks"); s = lower_parallel_tasks(s, closure_implementations, pipeline_name, t); // Process any LoweredFunctions added by other passes. In practice, this // will likely not work well enough due to ordering issues with @@ -520,8 +543,7 @@ void lower_impl(const vector &output_funcs, for (auto &lowered_func : closure_implementations) { result_module.append(lowered_func); } - debug(2) << "Lowering after generating parallel tasks and closures:\n" - << s << "\n\n"; + log.end(s); vector public_args = args; for (const auto &out : outputs) { diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 7351a038e019..85364cc345b5 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -9,6 +9,7 @@ #include "CodeGen_PTX_Dev.h" #include "CodeGen_Vulkan_Dev.h" #include "CodeGen_WebGPU_Dev.h" +#include "CompilerProfiling.h" #include "ExprUsesVar.h" #include "IRMutator.h" #include "IROperator.h" @@ -248,6 +249,7 @@ class InjectGpuOffload : public IRMutator { public: InjectGpuOffload(const Target &target, bool any_strict_float) : target(target) { + ZoneScoped; Target device_target = target; // For the GPU target we just want to pass the flags, to avoid the // generated kernel code unintentionally having any dependence on the @@ -281,12 +283,13 @@ class InjectGpuOffload : public IRMutator { } Stmt inject(const Stmt &s) { + ZoneScoped; // Create a new module for all of the kernels we find in this function. 
for (auto &i : cgdev) { i.second->init_module(); } - Stmt result = mutate(s); + Stmt result = (*this)(s); for (auto &i : cgdev) { string api_unique_name = i.second->api_unique_name(); diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index f4dca6ee3292..7f3fd7ceef9b 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -4,6 +4,7 @@ #include "Argument.h" #include "Callable.h" #include "CodeGen_Internal.h" +#include "CompilerProfiling.h" #include "Deserialization.h" #include "FindCalls.h" #include "Func.h" @@ -495,6 +496,7 @@ Module Pipeline::compile_to_module(const vector &args, const string &fn_name, const Target &target, const LinkageType linkage_type) { + ZoneScoped; user_assert(defined()) << "Can't compile undefined Pipeline.\n"; for (const Function &f : contents->outputs) { @@ -580,6 +582,7 @@ Target Pipeline::get_compiled_jit_target() const { } void Pipeline::compile_jit(const Target &target_arg) { + ZoneScoped; user_assert(defined()) << "Pipeline is undefined\n"; Target target = target_arg; @@ -645,6 +648,7 @@ void Pipeline::compile_jit(const Target &target_arg) { } Callable Pipeline::compile_to_callable(const std::vector &args_in, const Target &target_arg) { + ZoneScoped; user_assert(defined()) << "Pipeline is undefined\n"; Target target = target_arg.with_feature(Target::JIT).with_feature(Target::UserContext); @@ -676,6 +680,7 @@ Callable Pipeline::compile_to_callable(const std::vector &args_in, con const std::vector &outputs, const std::map &jit_externs_in, const Target &target_arg) { + ZoneScoped; user_assert(!target_arg.has_unknowns()) << "Cannot jit-compile for target '" << target_arg << "'\n"; Target jit_target = target_arg.with_feature(Target::JIT).with_feature(Target::UserContext); diff --git a/src/Prefetch.cpp b/src/Prefetch.cpp index 6ab41b3deade..d0d27d9e47be 100644 --- a/src/Prefetch.cpp +++ b/src/Prefetch.cpp @@ -445,12 +445,14 @@ Stmt inject_placeholder_prefetch(const Stmt &s, const map &env } Stmt inject_prefetch(const Stmt &s, const map &env) 
{ + ZoneScoped; CollectExternalBufferBounds finder; s.accept(&finder); return InjectPrefetch(env, finder.buffers)(s); } Stmt reduce_prefetch_dimension(Stmt stmt, const Target &t) { + ZoneScoped; size_t max_dim = 0; Expr max_byte_size; diff --git a/src/Simplify.cpp b/src/Simplify.cpp index a7d7d7f6faa0..d1ac6d828efb 100644 --- a/src/Simplify.cpp +++ b/src/Simplify.cpp @@ -3,6 +3,7 @@ #include "CSE.h" #include "CompilerLogger.h" +#include "CompilerProfiling.h" #include "IRMutator.h" #include "Substitute.h" @@ -16,6 +17,7 @@ using std::string; using std::vector; Simplify::Simplify(const Scope *bi, const Scope *ai) { + ZoneScoped; // Only respect the constant bounds from the containing scope. for (auto iter = bi->cbegin(); iter != bi->cend(); ++iter) { ExprInfo info; @@ -323,6 +325,7 @@ template T substitute_facts_impl(const T &t, const std::set &truths, const std::set &falsehoods) { + ZoneScoped; return mutate_with(t, [&](auto *self, const Expr &e) { if (e.type().is_bool()) { if (truths.count(e)) { @@ -363,6 +366,7 @@ Expr simplify(const Expr &e, const Scope &bounds, const Scope &alignment, const std::vector &assumptions) { + ZoneScoped; Simplify m(&bounds, &alignment); std::vector facts; facts.reserve(assumptions.size()); @@ -380,6 +384,7 @@ Stmt simplify(const Stmt &s, const Scope &bounds, const Scope &alignment, const std::vector &assumptions) { + ZoneScoped; Simplify m(&bounds, &alignment); std::vector facts; facts.reserve(assumptions.size()); @@ -402,10 +407,12 @@ class SimplifyExprs : public IRMutator { }; Stmt simplify_exprs(const Stmt &s) { + ZoneScoped; return SimplifyExprs().mutate(s); } bool can_prove(Expr e, const Scope &bounds) { + ZoneScoped; internal_assert(e.type().is_bool()) << "Argument to can_prove is not a boolean Expr: " << e << "\n"; diff --git a/src/Solve.cpp b/src/Solve.cpp index 204ec7eab366..67ea54a1c1b0 100644 --- a/src/Solve.cpp +++ b/src/Solve.cpp @@ -1210,6 +1210,7 @@ class SolveForInterval : public IRVisitor { } // Anonymous namespace 
SolverResult solve_expression(const Expr &e, const std::string &variable, const Scope &scope) { + ZoneScoped; SolveExpression solver(variable, scope); Expr new_e = solver(e); // The process has expanded lets. Re-collect them. diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index da14d1b52530..0e5dd17c1fa4 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -626,6 +626,7 @@ Stmt storage_flattening(Stmt s, const vector &outputs, const map &env, const Target &target) { + ZoneScoped; s = zero_gpu_loop_mins(s); // Make an environment that makes it easier to figure out which diff --git a/src/Substitute.cpp b/src/Substitute.cpp index 3781914944cf..df71101d0b07 100644 --- a/src/Substitute.cpp +++ b/src/Substitute.cpp @@ -140,10 +140,12 @@ auto substitute_impl(const Expr &find, const Expr &replacement, const T &ir) { } // namespace Expr substitute(const Expr &find, const Expr &replacement, const Expr &expr) { + ZoneScoped; return substitute_impl(find, replacement, expr); } Stmt substitute(const Expr &find, const Expr &replacement, const Stmt &stmt) { + ZoneScoped; return substitute_impl(find, replacement, stmt); } @@ -204,18 +206,22 @@ class GraphSubstituteExpr : public IRGraphMutator { } // namespace Expr graph_substitute(const string &name, const Expr &replacement, const Expr &expr) { + ZoneScoped; return GraphSubstitute(name, replacement)(expr); } Stmt graph_substitute(const string &name, const Expr &replacement, const Stmt &stmt) { + ZoneScoped; return GraphSubstitute(name, replacement)(stmt); } Expr graph_substitute(const Expr &find, const Expr &replacement, const Expr &expr) { + ZoneScoped; return GraphSubstituteExpr(find, replacement)(expr); } Stmt graph_substitute(const Expr &find, const Expr &replacement, const Stmt &stmt) { + ZoneScoped; return GraphSubstituteExpr(find, replacement)(stmt); } @@ -235,10 +241,12 @@ class SubstituteInAllLets : public IRGraphMutator { } // namespace Expr substitute_in_all_lets(const Expr &expr) 
{ + ZoneScoped; return SubstituteInAllLets()(expr); } Stmt substitute_in_all_lets(const Stmt &stmt) { + ZoneScoped; return SubstituteInAllLets()(stmt); } diff --git a/src/UniquifyVariableNames.cpp b/src/UniquifyVariableNames.cpp index 6eec453afc63..2c6cc2e60178 100644 --- a/src/UniquifyVariableNames.cpp +++ b/src/UniquifyVariableNames.cpp @@ -165,6 +165,7 @@ class FindFreeVars : public IRVisitor { } // namespace Stmt uniquify_variable_names(const Stmt &s) { + ZoneScoped; FindFreeVars finder; s.accept(&finder); return UniquifyVariableNames(&finder.free_vars)(s); diff --git a/src/UnrollLoops.cpp b/src/UnrollLoops.cpp index 127dac785dea..48ec31bb655b 100644 --- a/src/UnrollLoops.cpp +++ b/src/UnrollLoops.cpp @@ -1,4 +1,5 @@ #include "UnrollLoops.h" +#include "CompilerProfiling.h" #include "IRMutator.h" #include "IROperator.h" #include "Simplify.h" @@ -16,6 +17,7 @@ class UnrollLoops : public IRMutator { Stmt visit(const For *for_loop) override { if (for_loop->for_type == ForType::Unrolled) { + ZoneScopedN("UnrollForLoop"); Stmt body = for_loop->body; Expr extent = simplify(for_loop->extent()); const IntImm *e = extent.as(); diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index f97099287003..059ca0f8c8d6 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -1663,6 +1663,7 @@ class AllStoresInScope : public IRVisitor { } }; bool all_stores_in_scope(const Stmt &stmt, const Scope<> &scope) { + ZoneScoped; AllStoresInScope checker(scope); stmt.accept(&checker); return checker.result; @@ -1711,6 +1712,7 @@ Stmt vectorize_statement(const Stmt &stmt) { } // namespace Stmt vectorize_loops(const Stmt &stmt, const map &env) { + ZoneScoped; // Limit the scope of atomic nodes to just the necessary stuff. // TODO: Should this be an earlier pass? It's probably a good idea // for non-vectorizing stuff too.