hw-native-sys · zmnobug · Jun 17, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -301,11 +301,11 @@ jobs:
 
   # ---------- Profiling sub-flags smoke (compile + run) ----------
   # PTO2_PROFILING / PTO2_ORCH_PROFILING / PTO2_SCHED_PROFILING /
-  # PTO2_TENSORMAP_PROFILING are compile-time gates in src/{a2a3,a5}/runtime/
-  # tensormap_and_ringbuffer/runtime/pto_runtime2_types.h. The defaults
-  # (PTO2_PROFILING=1, sub-flags=0) are exercised by every CI job, but the
-  # non-default branches are dead coverage today — a developer flipping any
-  # of them for perf debugging or to minimize logging overhead had no gate
+  # PTO2_TENSORMAP_PROFILING are compile-time gates whose defaults and
+  # dependency checks live in src/common/task_interface/profiling_config.h.
+  # The defaults (PTO2_PROFILING=1, sub-flags=0) are exercised by every CI job,
+  # but non-default branches are otherwise dead coverage: a developer flipping
+  # any of them for perf debugging or to minimize logging overhead had no gate
   # protecting the gated code from drift (renamed fields, changed signatures,
   # format-string mismatches in LOG_INFO summaries, dead parameters caught
   # only by -Wunused-parameter -Werror).

diff --git a/docs/dfx/l2-timing.md b/docs/dfx/l2-timing.md
@@ -2,8 +2,9 @@
 
 For an L2 run you usually look at a handful of timing numbers. They come from
 **two channels**, both available with no extra flags because they ride on the
-compile-time `PTO2_PROFILING` macro (default `1`, already in the prebuilt
-runtimes):
+compile-time `PTO2_PROFILING` macro (default `1` in
+`src/common/task_interface/profiling_config.h`, already enabled in the
+prebuilt runtimes):
 
 1. **`RunTiming`** — `host_wall` and `device_wall`, returned directly by
    `Worker.run()` and printed per round by the harness.

diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md
@@ -324,6 +324,11 @@ What you can read out of `tensor_dump.json` + `tensor_dump.bin`:
 
 ## 5. Design Highlights
 
+`Arg::dump(...)` selection state is compiled only when
+`PTO2_PROFILING=1`. With `PTO2_PROFILING=0`, the public API remains
+available but acts as a no-op: no dump-only `Arg` state is stored and
+submit does not propagate dump metadata.
+
 ### 5.1 Common device-side structures
 
 Both architectures share the same device-side layout, published via

diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
@@ -12,13 +12,7 @@
 #ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
 #define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
 
-// =============================================================================
-// Profiling Configuration
-// =============================================================================
-
-#ifndef PTO2_PROFILING
-#define PTO2_PROFILING 1
-#endif
+#include "profiling_config.h"
 
 // =============================================================================
 // Tensor Dump Configuration

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -8,6 +8,11 @@ PTO Runtime2 uses a hierarchical profiling system with compile-time macros to co
 
 ## Profiling Macro Hierarchy
 
+Defaults and dependency validation are centralized in
+`src/common/task_interface/profiling_config.h`. Runtime headers include that
+file before using the macros, so both a2a3 and a5 share the same default
+values and compile-time checks.
+
 ```text
 PTO2_PROFILING (base level, default=1)
 ├── PTO2_ORCH_PROFILING (orchestrator, default=0, requires PTO2_PROFILING=1)
@@ -385,14 +390,24 @@ PTO2_TENSORMAP_PROFILING=1
 
 ### At compile time
 
+Pass compile definitions through the build command or CI `CXXFLAGS`.
+This overrides the defaults in `profiling_config.h` without changing source.
+
 ```bash
-# In CMakeLists.txt or build command
-add_definitions(-DPTO2_PROFILING=1)
-add_definitions(-DPTO2_ORCH_PROFILING=1)
+# Example: disable all profiling code
+CXXFLAGS="-DPTO2_PROFILING=0" pip install --no-build-isolation -e .
+
+# Example: enable orchestrator and tensormap profiling
+CXXFLAGS="-DPTO2_ORCH_PROFILING=1 -DPTO2_TENSORMAP_PROFILING=1" \
+    pip install --no-build-isolation -e .
 ```
 
 ### In source code (before including headers)
 
+Source-level overrides are only for local experiments. They must appear before
+any header includes `profiling_config.h`; do not add duplicated fallback
+definitions to runtime headers.
+
 ```cpp
 #define PTO2_PROFILING 1
 #define PTO2_ORCH_PROFILING 1
@@ -435,7 +450,7 @@ add_definitions(-DPTO2_ORCH_PROFILING=1)
 
 ### Code Locations
 
-- Macro definitions: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h`
+- Macro defaults and validation: `src/common/task_interface/profiling_config.h`
 - Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp`
 - Orchestrator profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp`
 - TensorMap profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h`

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -31,6 +31,7 @@
 
 #include <atomic>
 
+#include "profiling_config.h"
 #include "pto_constants.h"
 #include "pto_runtime_status.h"
 #include "pto2_dispatch_payload.h"
@@ -50,38 +51,6 @@
 #define SPIN_WAIT_HINT() ((void)0)
 #endif
 
-// =============================================================================
-// Profiling Configuration
-// =============================================================================
-
-#ifndef PTO2_PROFILING
-#define PTO2_PROFILING 1
-#endif
-
-#ifndef PTO2_ORCH_PROFILING
-#define PTO2_ORCH_PROFILING 0
-#endif
-
-#ifndef PTO2_SCHED_PROFILING
-#define PTO2_SCHED_PROFILING 0
-#endif
-
-#ifndef PTO2_TENSORMAP_PROFILING
-#define PTO2_TENSORMAP_PROFILING 0
-#endif
-
-#if PTO2_ORCH_PROFILING && !PTO2_PROFILING
-#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1"
-#endif
-
-#if PTO2_SCHED_PROFILING && !PTO2_PROFILING
-#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1"
-#endif
-
-#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING
-#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1"
-#endif
-
 #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
 #include "aicpu/device_time.h"
 #endif

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -43,6 +43,7 @@
 #pragma once
 
 #include "common.h"
+#include "profiling_config.h"
 #include "utils/device_arena.h"
 #include "pto_runtime2_types.h"
 #include "tensor.h"
@@ -64,13 +65,7 @@ struct PTO2TensorMapLayout {
     int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
 };
 
-// =============================================================================
-// TensorMap Lookup Profiling (must precede inline lookup/insert methods)
-// =============================================================================
-#ifndef PTO2_TENSORMAP_PROFILING
-#define PTO2_TENSORMAP_PROFILING 0
-#endif
-
+// TensorMap Lookup Profiling (must precede inline lookup/insert methods).
 #if PTO2_TENSORMAP_PROFILING
 extern uint64_t g_lookup_chain_total;
 extern uint64_t g_lookup_count;

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
@@ -35,16 +35,14 @@
 #include <arm_neon.h>
 #endif
 
+#include "aicpu/dump_arg_selection.h"
 #include "data_type.h"
+#include "profiling_config.h"
 #include "pto_submit_types.h"
 #include "task_args.h"
 #include "tensor.h"
 #include "tensor_arg.h"
 
-#ifndef PTO2_PROFILING
-#define PTO2_PROFILING 1
-#endif
-
 // Task arguments — alias the common CORE_MAX_* constants (single source of
 // truth in src/common/task_interface/arg_direction.h, transitively included
 // via task_args.h above). Keeping the MAX_TENSOR_ARGS / MAX_SCALAR_ARGS names
@@ -187,10 +185,7 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
     void clear() {
         TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS, TensorArgType>::clear();
 #if PTO2_PROFILING
-        dump_arg_mask_ = 0;
-        dump_arg_index_ambiguous_mask_ = 0;
-        clear_scalar_sources();
-        memset(scalar_dtypes_, 0, sizeof(scalar_dtypes_));
+        dump_arg_selection_.clear();
 #endif
         explicit_deps_ = nullptr;
         explicit_dep_count_ = 0;
@@ -231,8 +226,10 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
     }
 
 #if PTO2_PROFILING
-    uint64_t tensor_dump_arg_mask() const { return dump_arg_mask_; }
-    uint64_t tensor_dump_arg_index_ambiguous_mask() const { return dump_arg_index_ambiguous_mask_; }
+    uint64_t tensor_dump_arg_mask() const { return dump_arg_selection_.dump_arg_mask(); }
+    uint64_t tensor_dump_arg_index_ambiguous_mask() const {
+        return dump_arg_selection_.dump_arg_index_ambiguous_mask();
+    }
 #else
     uint64_t tensor_dump_arg_mask() const { return 0; }
     uint64_t tensor_dump_arg_index_ambiguous_mask() const { return 0; }
@@ -348,8 +345,7 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
         }
         memcpy(&scalars_[scalar_count_], values, count * sizeof(uint64_t));
 #if PTO2_PROFILING
-        memset(&scalar_dtypes_[scalar_count_], 0, count * sizeof(uint8_t));
-        clear_scalar_sources(scalar_count_, count);
+        dump_arg_selection_.clear_scalar_metadata(scalar_count_, count);
 #endif
         scalar_count_ += count;
     }
@@ -384,8 +380,7 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
         }
 #endif
 #if PTO2_PROFILING
-        memset(&scalar_dtypes_[scalar_count_], 0, count * sizeof(uint8_t));
-        clear_scalar_sources(scalar_count_, count);
+        dump_arg_selection_.clear_scalar_metadata(scalar_count_, count);
 #endif
         scalar_count_ += count;
     }
@@ -395,7 +390,7 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
      * Useful when multiple tasks share the same scalar data (e.g., block indices).
      */
     void copy_scalars_from(const Arg &src, int src_offset, int count) {
-        if (count < 0 || src_offset + count > src.scalar_count_) {
+        if (src_offset < 0 || count < 0 || src_offset + count > src.scalar_count_) {
             set_error("Source scalar range out of bounds in copy_scalars_from");
             return;
         }
@@ -405,59 +400,42 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
         }
         memcpy(&scalars_[scalar_count_], &src.scalars_[src_offset], count * sizeof(uint64_t));
 #if PTO2_PROFILING
-        memcpy(&scalar_dtypes_[scalar_count_], &src.scalar_dtypes_[src_offset], count * sizeof(uint8_t));
-        clear_scalar_sources(scalar_count_, count);
+        dump_arg_selection_.copy_scalar_dtypes_from(src.dump_arg_selection_, scalar_count_, src_offset, count);
 #endif
         scalar_count_ += count;
     }
 
 #if PTO2_PROFILING
-    const uint8_t *scalar_dtypes() const { return scalar_dtypes_; }
+    const uint8_t *scalar_dtypes() const { return dump_arg_selection_.scalar_dtypes(); }
 #else
     const uint8_t *scalar_dtypes() const { return nullptr; }
 #endif
 
 private:
     // Caller-owned dependency array; lifetime must extend through submit.
 #if PTO2_PROFILING
-    static_assert(MAX_TENSOR_ARGS + MAX_SCALAR_ARGS <= 64, "dump arg mask assumes at most 64 arguments");
-    uint64_t dump_arg_mask_{0};
-    uint64_t dump_arg_index_ambiguous_mask_{0};
-    uintptr_t scalar_source_ptrs_[MAX_SCALAR_ARGS]{};
+    DumpArgSelection dump_arg_selection_;
 #endif
     const PTO2TaskId *explicit_deps_{nullptr};
     uint32_t explicit_dep_count_{0};
 #if PTO2_PROFILING
-    uint8_t scalar_dtypes_[MAX_SCALAR_ARGS] = {};
-
     template <typename T>
     static constexpr bool is_supported_dump_arg_v =
         std::is_same_v<std::decay_t<T>, Tensor> || std::is_same_v<std::decay_t<T>, TensorCreateInfo> ||
         is_supported_scalar_arg_v<T>;
-
-    void mark_arg_index(int32_t index) { dump_arg_mask_ |= (uint64_t{1} << index); }
-    void mark_arg_index_ambiguous(int32_t index) { dump_arg_index_ambiguous_mask_ |= (uint64_t{1} << index); }
-
-    void clear_scalar_sources() { clear_scalar_sources(0, MAX_SCALAR_ARGS); }
-
-    void clear_scalar_sources(int32_t start, int32_t count) {
-        for (int32_t i = 0; i < count; i++) {
-            scalar_source_ptrs_[start + i] = 0;
-        }
-    }
-
 #endif
 
     template <typename T>
     void add_scalar_one(T &&value) {
         scalars_[scalar_count_] = to_u64(value);
 #if PTO2_PROFILING
-        scalar_dtypes_[scalar_count_] = dtype_of<std::remove_cv_t<std::remove_reference_t<T>>>();
+        uintptr_t scalar_source_ptr = 0;
         if constexpr (std::is_lvalue_reference_v<T>) {
-            scalar_source_ptrs_[scalar_count_] = reinterpret_cast<uintptr_t>(&value);
-        } else {
-            scalar_source_ptrs_[scalar_count_] = 0;
+            scalar_source_ptr = reinterpret_cast<uintptr_t>(&value);
         }
+        dump_arg_selection_.record_scalar_source(
+            scalar_count_, scalar_source_ptr, dtype_of<std::remove_cv_t<std::remove_reference_t<T>>>()
+        );
 #endif
         scalar_count_++;
     }
@@ -469,18 +447,13 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
             set_error("dump: no arguments added to this Arg");
             return;
         }
-        for (int32_t i = 0; i < tensor_count_; i++) {
-            mark_arg_index(i);
-        }
-        for (int32_t i = 0; i < scalar_count_; i++) {
-            mark_arg_index(tensor_count_ + i);
-        }
+        dump_arg_selection_.mark_all(tensor_count_, scalar_count_);
     }
 
     void mark_dump_arg(const Tensor &tensor) {
         for (int32_t i = 0; i < tensor_count_; i++) {
             if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].ptr == &tensor) {
-                mark_arg_index(i);
+                dump_arg_selection_.mark_index(i);
                 return;
             }
         }
@@ -490,7 +463,7 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
     void mark_dump_arg(const TensorCreateInfo &create_info) {
         for (int32_t i = 0; i < tensor_count_; i++) {
             if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].create_info == &create_info) {
-                mark_arg_index(i);
+                dump_arg_selection_.mark_index(i);
                 return;
             }
         }
@@ -500,22 +473,7 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
     template <typename T>
     std::enable_if_t<is_supported_scalar_arg_v<T>, void> mark_dump_arg(const T &scalar) {
         uintptr_t ptr = reinterpret_cast<uintptr_t>(&scalar);
-        int32_t first_match = -1;
-        int32_t match_count = 0;
-        for (int32_t i = 0; i < scalar_count_; i++) {
-            if (scalar_source_ptrs_[i] == ptr) {
-                if (first_match < 0) {
-                    first_match = i;
-                }
-                match_count++;
-            }
-        }
-        if (first_match >= 0) {
-            int32_t arg_index = tensor_count_ + first_match;
-            mark_arg_index(arg_index);
-            if (match_count > 1) {
-                mark_arg_index_ambiguous(arg_index);
-            }
+        if (dump_arg_selection_.mark_scalar_by_ptr(ptr, scalar_count_, tensor_count_)) {
             return;
         }
         set_error("dump: scalar is not part of this Arg");

diff --git a/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h
@@ -12,13 +12,7 @@
 #ifndef SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
 #define SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
 
-// =============================================================================
-// Profiling Configuration
-// =============================================================================
-
-#ifndef PTO2_PROFILING
-#define PTO2_PROFILING 1
-#endif
+#include "profiling_config.h"
 
 // =============================================================================
 // Tensor Dump Configuration