From 1266722189ff2cb37620fa421a419e134a72f9ec Mon Sep 17 00:00:00 2001 From: zm Date: Wed, 17 Jun 2026 15:32:01 +0800 Subject: [PATCH] Refactor: share dump arg selection state - Centralize PTO2 profiling macro defaults in profiling_config.h - Move tensor/scalar dump selection metadata into DumpArgSelection - Reuse the shared helper from a2a3 and a5 Arg implementations - Update profiling and tensor dump docs for the new macro source --- .github/workflows/ci.yml | 10 +- docs/dfx/l2-timing.md | 5 +- docs/dfx/tensor-dump.md | 5 + .../runtime/pto_runtime2_types.h | 8 +- .../docs/profiling_levels.md | 23 ++- .../runtime/pto_runtime2_types.h | 33 +--- .../runtime/pto_tensormap.h | 9 +- .../runtime/pto_types.h | 86 +++------- .../runtime/pto_runtime2_types.h | 8 +- .../host_build_graph/runtime/tensor_info.h | 9 +- .../docs/profiling_levels.md | 23 ++- .../runtime/pto_runtime2_types.h | 33 +--- .../runtime/pto_tensormap.h | 9 +- .../runtime/pto_types.h | 86 +++------- .../include/aicpu/dump_arg_selection.h | 162 ++++++++++++++++++ src/common/task_interface/profiling_config.h | 43 +++++ 16 files changed, 309 insertions(+), 243 deletions(-) create mode 100644 src/common/platform/include/aicpu/dump_arg_selection.h create mode 100644 src/common/task_interface/profiling_config.h diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index adf44518d..0dc6e5a99 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -301,11 +301,11 @@ jobs: # ---------- Profiling sub-flags smoke (compile + run) ---------- # PTO2_PROFILING / PTO2_ORCH_PROFILING / PTO2_SCHED_PROFILING / - # PTO2_TENSORMAP_PROFILING are compile-time gates in src/{a2a3,a5}/runtime/ - # tensormap_and_ringbuffer/runtime/pto_runtime2_types.h. The defaults - # (PTO2_PROFILING=1, sub-flags=0) are exercised by every CI job, but the - # non-default branches are dead coverage today — a developer flipping any - # of them for perf debugging or to minimize logging overhead had no gate + # PTO2_TENSORMAP_PROFILING are compile-time gates whose defaults and + # dependency checks live in src/common/task_interface/profiling_config.h. + # The defaults (PTO2_PROFILING=1, sub-flags=0) are exercised by every CI job, + # but non-default branches are otherwise dead coverage: a developer flipping + # any of them for perf debugging or to minimize logging overhead had no gate # protecting the gated code from drift (renamed fields, changed signatures, # format-string mismatches in LOG_INFO summaries, dead parameters caught # only by -Wunused-parameter -Werror). diff --git a/docs/dfx/l2-timing.md b/docs/dfx/l2-timing.md index 337406311..870ff418f 100644 --- a/docs/dfx/l2-timing.md +++ b/docs/dfx/l2-timing.md @@ -2,8 +2,9 @@ For an L2 run you usually look at a handful of timing numbers. They come from **two channels**, both available with no extra flags because they ride on the -compile-time `PTO2_PROFILING` macro (default `1`, already in the prebuilt -runtimes): +compile-time `PTO2_PROFILING` macro (default `1` in +`src/common/task_interface/profiling_config.h`, already enabled in the +prebuilt runtimes): 1. **`RunTiming`** — `host_wall` and `device_wall`, returned directly by `Worker.run()` and printed per round by the harness. diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md index edef2dc1a..a2eef2bb2 100644 --- a/docs/dfx/tensor-dump.md +++ b/docs/dfx/tensor-dump.md @@ -324,6 +324,11 @@ What you can read out of `tensor_dump.json` + `tensor_dump.bin`: ## 5. Design Highlights +`Arg::dump(...)` selection state is compiled only when +`PTO2_PROFILING=1`. With `PTO2_PROFILING=0`, the public API remains +available but acts as a no-op: no dump-only `Arg` state is stored and +submit does not propagate dump metadata. + ### 5.1 Common device-side structures Both architectures share the same device-side layout, published via diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h index 4d4bb9313..82bb7c193 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h @@ -12,13 +12,7 @@ #ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ #define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ -// ============================================================================= -// Profiling Configuration -// ============================================================================= - -#ifndef PTO2_PROFILING -#define PTO2_PROFILING 1 -#endif +#include "profiling_config.h" // ============================================================================= // Tensor Dump Configuration diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index b74a2fa6a..bd669f365 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -8,6 +8,11 @@ PTO Runtime2 uses a hierarchical profiling system with compile-time macros to co ## Profiling Macro Hierarchy +Defaults and dependency validation are centralized in +`src/common/task_interface/profiling_config.h`. Runtime headers include that +file before using the macros, so both a2a3 and a5 share the same default +values and compile-time checks. + ```text PTO2_PROFILING (base level, default=1) ├── PTO2_ORCH_PROFILING (orchestrator, default=0, requires PTO2_PROFILING=1) @@ -385,14 +390,24 @@ PTO2_TENSORMAP_PROFILING=1 ### At compile time +Pass compile definitions through the build command or CI `CXXFLAGS`. +This overrides the defaults in `profiling_config.h` without changing source. + ```bash -# In CMakeLists.txt or build command -add_definitions(-DPTO2_PROFILING=1) -add_definitions(-DPTO2_ORCH_PROFILING=1) +# Example: disable all profiling code +CXXFLAGS="-DPTO2_PROFILING=0" pip install --no-build-isolation -e . + +# Example: enable orchestrator and tensormap profiling +CXXFLAGS="-DPTO2_ORCH_PROFILING=1 -DPTO2_TENSORMAP_PROFILING=1" \ + pip install --no-build-isolation -e . ``` ### In source code (before including headers) +Source-level overrides are only for local experiments. They must appear before +any header includes `profiling_config.h`; do not add duplicated fallback +definitions to runtime headers. + ```cpp #define PTO2_PROFILING 1 #define PTO2_ORCH_PROFILING 1 @@ -435,7 +450,7 @@ add_definitions(-DPTO2_ORCH_PROFILING=1) ### Code Locations -- Macro definitions: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h` +- Macro defaults and validation: `src/common/task_interface/profiling_config.h` - Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp` - Orchestrator profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp` - TensorMap profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h` diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index ecda02555..202a7cf89 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -31,6 +31,7 @@ #include +#include "profiling_config.h" #include "pto_constants.h" #include "pto_runtime_status.h" #include "pto2_dispatch_payload.h" @@ -50,38 +51,6 @@ #define SPIN_WAIT_HINT() ((void)0) #endif -// ============================================================================= -// Profiling Configuration -// ============================================================================= - -#ifndef PTO2_PROFILING -#define PTO2_PROFILING 1 -#endif - -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 0 -#endif - -#ifndef PTO2_SCHED_PROFILING -#define PTO2_SCHED_PROFILING 0 -#endif - -#ifndef PTO2_TENSORMAP_PROFILING -#define PTO2_TENSORMAP_PROFILING 0 -#endif - -#if PTO2_ORCH_PROFILING && !PTO2_PROFILING -#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1" -#endif - -#if PTO2_SCHED_PROFILING && !PTO2_PROFILING -#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1" -#endif - -#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING -#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1" -#endif - #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING #include "aicpu/device_time.h" #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 875b79bbe..7acaa046f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -43,6 +43,7 @@ #pragma once #include "common.h" +#include "profiling_config.h" #include "utils/device_arena.h" #include "pto_runtime2_types.h" #include "tensor.h" @@ -64,13 +65,7 @@ struct PTO2TensorMapLayout { int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; }; -// ============================================================================= -// TensorMap Lookup Profiling (must precede inline lookup/insert methods) -// ============================================================================= -#ifndef PTO2_TENSORMAP_PROFILING -#define PTO2_TENSORMAP_PROFILING 0 -#endif - +// TensorMap Lookup Profiling (must precede inline lookup/insert methods). #if PTO2_TENSORMAP_PROFILING extern uint64_t g_lookup_chain_total; extern uint64_t g_lookup_count; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h index 16dc796ea..75c165078 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h @@ -35,16 +35,14 @@ #include #endif +#include "aicpu/dump_arg_selection.h" #include "data_type.h" +#include "profiling_config.h" #include "pto_submit_types.h" #include "task_args.h" #include "tensor.h" #include "tensor_arg.h" -#ifndef PTO2_PROFILING -#define PTO2_PROFILING 1 -#endif - // Task arguments — alias the common CORE_MAX_* constants (single source of // truth in src/common/task_interface/arg_direction.h, transitively included // via task_args.h above). Keeping the MAX_TENSOR_ARGS / MAX_SCALAR_ARGS names @@ -187,10 +185,7 @@ struct Arg : TaskArgsTpl::clear(); #if PTO2_PROFILING - dump_arg_mask_ = 0; - dump_arg_index_ambiguous_mask_ = 0; - clear_scalar_sources(); - memset(scalar_dtypes_, 0, sizeof(scalar_dtypes_)); + dump_arg_selection_.clear(); #endif explicit_deps_ = nullptr; explicit_dep_count_ = 0; @@ -231,8 +226,10 @@ struct Arg : TaskArgsTpl src.scalar_count_) { + if (src_offset < 0 || count < 0 || src_offset + count > src.scalar_count_) { set_error("Source scalar range out of bounds in copy_scalars_from"); return; } @@ -405,14 +400,13 @@ struct Arg : TaskArgsTpl static constexpr bool is_supported_dump_arg_v = std::is_same_v, Tensor> || std::is_same_v, TensorCreateInfo> || is_supported_scalar_arg_v; - - void mark_arg_index(int32_t index) { dump_arg_mask_ |= (uint64_t{1} << index); } - void mark_arg_index_ambiguous(int32_t index) { dump_arg_index_ambiguous_mask_ |= (uint64_t{1} << index); } - - void clear_scalar_sources() { clear_scalar_sources(0, MAX_SCALAR_ARGS); } - - void clear_scalar_sources(int32_t start, int32_t count) { - for (int32_t i = 0; i < count; i++) { - scalar_source_ptrs_[start + i] = 0; - } - } - #endif template void add_scalar_one(T &&value) { scalars_[scalar_count_] = to_u64(value); #if PTO2_PROFILING - scalar_dtypes_[scalar_count_] = dtype_of>>(); + uintptr_t scalar_source_ptr = 0; if constexpr (std::is_lvalue_reference_v) { - scalar_source_ptrs_[scalar_count_] = reinterpret_cast(&value); - } else { - scalar_source_ptrs_[scalar_count_] = 0; + scalar_source_ptr = reinterpret_cast(&value); } + dump_arg_selection_.record_scalar_source( + scalar_count_, scalar_source_ptr, dtype_of>>() + ); #endif scalar_count_++; } @@ -469,18 +447,13 @@ struct Arg : TaskArgsTpl std::enable_if_t, void> mark_dump_arg(const T &scalar) { uintptr_t ptr = reinterpret_cast(&scalar); - int32_t first_match = -1; - int32_t match_count = 0; - for (int32_t i = 0; i < scalar_count_; i++) { - if (scalar_source_ptrs_[i] == ptr) { - if (first_match < 0) { - first_match = i; - } - match_count++; - } - } - if (first_match >= 0) { - int32_t arg_index = tensor_count_ + first_match; - mark_arg_index(arg_index); - if (match_count > 1) { - mark_arg_index_ambiguous(arg_index); - } + if (dump_arg_selection_.mark_scalar_by_ptr(ptr, scalar_count_, tensor_count_)) { return; } set_error("dump: scalar is not part of this Arg"); diff --git a/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h index 5bf0057c6..8735a5adb 100644 --- a/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h +++ b/src/a5/runtime/host_build_graph/runtime/pto_runtime2_types.h @@ -12,13 +12,7 @@ #ifndef SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ #define SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_ -// ============================================================================= -// Profiling Configuration -// ============================================================================= - -#ifndef PTO2_PROFILING -#define PTO2_PROFILING 1 -#endif +#include "profiling_config.h" // ============================================================================= // Tensor Dump Configuration diff --git a/src/a5/runtime/host_build_graph/runtime/tensor_info.h b/src/a5/runtime/host_build_graph/runtime/tensor_info.h index fe93f90fc..3fc4ae139 100644 --- a/src/a5/runtime/host_build_graph/runtime/tensor_info.h +++ b/src/a5/runtime/host_build_graph/runtime/tensor_info.h @@ -16,16 +16,9 @@ #include "common/platform_config.h" #include "data_type.h" +#include "profiling_config.h" #include "tensor_arg.h" -// ============================================================================= -// Profiling Configuration -// ============================================================================= - -#ifndef PTO2_PROFILING -#define PTO2_PROFILING 1 -#endif - struct TensorInfo { DataType dtype; uint8_t ndims; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md index af28a2de9..2ef6c1b6a 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md +++ b/src/a5/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md @@ -8,6 +8,11 @@ PTO Runtime2 uses a hierarchical profiling system with compile-time macros to co ## Profiling Macro Hierarchy +Defaults and dependency validation are centralized in +`src/common/task_interface/profiling_config.h`. Runtime headers include that +file before using the macros, so both a2a3 and a5 share the same default +values and compile-time checks. + ```text PTO2_PROFILING (base level, default=1) ├── PTO2_ORCH_PROFILING (orchestrator, default=0, requires PTO2_PROFILING=1) @@ -355,14 +360,24 @@ PTO2_TENSORMAP_PROFILING=1 ### At compile time +Pass compile definitions through the build command or CI `CXXFLAGS`. +This overrides the defaults in `profiling_config.h` without changing source. + ```bash -# In CMakeLists.txt or build command -add_definitions(-DPTO2_PROFILING=1) -add_definitions(-DPTO2_ORCH_PROFILING=1) +# Example: disable all profiling code +CXXFLAGS="-DPTO2_PROFILING=0" pip install --no-build-isolation -e . + +# Example: enable orchestrator and tensormap profiling +CXXFLAGS="-DPTO2_ORCH_PROFILING=1 -DPTO2_TENSORMAP_PROFILING=1" \ + pip install --no-build-isolation -e . ``` ### In source code (before including headers) +Source-level overrides are only for local experiments. They must appear before +any header includes `profiling_config.h`; do not add duplicated fallback +definitions to runtime headers. + ```cpp #define PTO2_PROFILING 1 #define PTO2_ORCH_PROFILING 1 @@ -405,7 +420,7 @@ add_definitions(-DPTO2_ORCH_PROFILING=1) ### Code Locations -- Macro definitions: `src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h` +- Macro defaults and validation: `src/common/task_interface/profiling_config.h` - Scheduler profiling: `src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp` - Orchestrator profiling: `src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp` - TensorMap profiling: `src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h` diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index b1399d6af..d401557f6 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -31,6 +31,7 @@ #include +#include "profiling_config.h" #include "pto_constants.h" #include "pto_runtime_status.h" #include "pto2_dispatch_payload.h" @@ -50,38 +51,6 @@ #define SPIN_WAIT_HINT() ((void)0) #endif -// ============================================================================= -// Profiling Configuration -// ============================================================================= - -#ifndef PTO2_PROFILING -#define PTO2_PROFILING 1 -#endif - -#ifndef PTO2_ORCH_PROFILING -#define PTO2_ORCH_PROFILING 0 -#endif - -#ifndef PTO2_SCHED_PROFILING -#define PTO2_SCHED_PROFILING 0 -#endif - -#ifndef PTO2_TENSORMAP_PROFILING -#define PTO2_TENSORMAP_PROFILING 0 -#endif - -#if PTO2_ORCH_PROFILING && !PTO2_PROFILING -#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1" -#endif - -#if PTO2_SCHED_PROFILING && !PTO2_PROFILING -#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1" -#endif - -#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING -#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1" -#endif - #if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING #include "aicpu/device_time.h" #endif diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 875b79bbe..7acaa046f 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -43,6 +43,7 @@ #pragma once #include "common.h" +#include "profiling_config.h" #include "utils/device_arena.h" #include "pto_runtime2_types.h" #include "tensor.h" @@ -64,13 +65,7 @@ struct PTO2TensorMapLayout { int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; }; -// ============================================================================= -// TensorMap Lookup Profiling (must precede inline lookup/insert methods) -// ============================================================================= -#ifndef PTO2_TENSORMAP_PROFILING -#define PTO2_TENSORMAP_PROFILING 0 -#endif - +// TensorMap Lookup Profiling (must precede inline lookup/insert methods). #if PTO2_TENSORMAP_PROFILING extern uint64_t g_lookup_chain_total; extern uint64_t g_lookup_count; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h index c9136f8ab..339e32b5e 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h @@ -35,16 +35,14 @@ #include #endif +#include "aicpu/dump_arg_selection.h" #include "data_type.h" +#include "profiling_config.h" #include "pto_submit_types.h" #include "task_args.h" #include "tensor.h" #include "tensor_arg.h" -#ifndef PTO2_PROFILING -#define PTO2_PROFILING 1 -#endif - // Task arguments — alias the common CORE_MAX_* constants (single source of // truth in src/common/task_interface/arg_direction.h, transitively included // via task_args.h above). Keeping the MAX_TENSOR_ARGS / MAX_SCALAR_ARGS names @@ -185,10 +183,7 @@ struct Arg : TaskArgsTpl::clear(); #if PTO2_PROFILING - dump_arg_mask_ = 0; - dump_arg_index_ambiguous_mask_ = 0; - clear_scalar_sources(); - memset(scalar_dtypes_, 0, sizeof(scalar_dtypes_)); + dump_arg_selection_.clear(); #endif explicit_deps_ = nullptr; explicit_dep_count_ = 0; @@ -229,8 +224,10 @@ struct Arg : TaskArgsTpl src.scalar_count_) { + if (src_offset < 0 || count < 0 || src_offset + count > src.scalar_count_) { set_error("Source scalar range out of bounds in copy_scalars_from"); return; } @@ -403,14 +398,13 @@ struct Arg : TaskArgsTpl static constexpr bool is_supported_dump_arg_v = std::is_same_v, Tensor> || std::is_same_v, TensorCreateInfo> || is_supported_scalar_arg_v; - - void mark_arg_index(int32_t index) { dump_arg_mask_ |= (uint64_t{1} << index); } - void mark_arg_index_ambiguous(int32_t index) { dump_arg_index_ambiguous_mask_ |= (uint64_t{1} << index); } - - void clear_scalar_sources() { clear_scalar_sources(0, MAX_SCALAR_ARGS); } - - void clear_scalar_sources(int32_t start, int32_t count) { - for (int32_t i = 0; i < count; i++) { - scalar_source_ptrs_[start + i] = 0; - } - } - #endif template void add_scalar_one(T &&value) { scalars_[scalar_count_] = to_u64(value); #if PTO2_PROFILING - scalar_dtypes_[scalar_count_] = dtype_of>>(); + uintptr_t scalar_source_ptr = 0; if constexpr (std::is_lvalue_reference_v) { - scalar_source_ptrs_[scalar_count_] = reinterpret_cast(&value); - } else { - scalar_source_ptrs_[scalar_count_] = 0; + scalar_source_ptr = reinterpret_cast(&value); } + dump_arg_selection_.record_scalar_source( + scalar_count_, scalar_source_ptr, dtype_of>>() + ); #endif scalar_count_++; } @@ -467,18 +445,13 @@ struct Arg : TaskArgsTpl std::enable_if_t, void> mark_dump_arg(const T &scalar) { uintptr_t ptr = reinterpret_cast(&scalar); - int32_t first_match = -1; - int32_t match_count = 0; - for (int32_t i = 0; i < scalar_count_; i++) { - if (scalar_source_ptrs_[i] == ptr) { - if (first_match < 0) { - first_match = i; - } - match_count++; - } - } - if (first_match >= 0) { - int32_t arg_index = tensor_count_ + first_match; - mark_arg_index(arg_index); - if (match_count > 1) { - mark_arg_index_ambiguous(arg_index); - } + if (dump_arg_selection_.mark_scalar_by_ptr(ptr, scalar_count_, tensor_count_)) { return; } set_error("dump: scalar is not part of this Arg"); diff --git a/src/common/platform/include/aicpu/dump_arg_selection.h b/src/common/platform/include/aicpu/dump_arg_selection.h new file mode 100644 index 000000000..7fe7d03ff --- /dev/null +++ b/src/common/platform/include/aicpu/dump_arg_selection.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_COMMON_PLATFORM_INCLUDE_AICPU_DUMP_ARG_SELECTION_H_ +#define SRC_COMMON_PLATFORM_INCLUDE_AICPU_DUMP_ARG_SELECTION_H_ + +#include +#include + +#include "arg_direction.h" + +struct DumpArgSelection { + static_assert(CORE_MAX_TENSOR_ARGS + CORE_MAX_SCALAR_ARGS <= 64, "dump arg mask assumes at most 64 arguments"); + + void clear() { + dump_arg_mask_ = 0; + dump_arg_index_ambiguous_mask_ = 0; + clear_scalar_sources(); + clear_scalar_dtypes(0, CORE_MAX_SCALAR_ARGS); + } + + uint64_t dump_arg_mask() const { return dump_arg_mask_; } + uint64_t dump_arg_index_ambiguous_mask() const { return dump_arg_index_ambiguous_mask_; } + const uint8_t *scalar_dtypes() const { return scalar_dtypes_; } + + void mark_index(int32_t index) { + if (!is_valid_arg_index(index)) { + return; + } + dump_arg_mask_ |= (uint64_t{1} << index); + } + + void mark_all(int32_t tensor_count, int32_t scalar_count) { + if (!is_valid_tensor_count(tensor_count) || !is_valid_scalar_range(0, scalar_count)) { + return; + } + for (int32_t i = 0; i < tensor_count; i++) { + mark_index(i); + } + for (int32_t i = 0; i < scalar_count; i++) { + mark_index(tensor_count + i); + } + } + + bool mark_scalar_by_ptr(uintptr_t ptr, int32_t scalar_count, int32_t tensor_offset) { + if (!is_valid_scalar_range(0, scalar_count) || !is_valid_tensor_count(tensor_offset) || + !is_valid_arg_range(tensor_offset, scalar_count)) { + return false; + } + + int32_t first_match = -1; + int32_t match_count = 0; + for (int32_t i = 0; i < scalar_count; i++) { + if (scalar_source_ptrs_[i] == ptr) { + if (first_match < 0) { + first_match = i; + } + match_count++; + } + } + if (first_match < 0) { + return false; + } + + int32_t arg_index = tensor_offset + first_match; + if (!is_valid_arg_index(arg_index)) { + return false; + } + mark_index(arg_index); + if (match_count > 1) { + mark_index_ambiguous(arg_index); + } + return true; + } + + void record_scalar_source(int32_t slot, uintptr_t ptr, uint8_t dtype) { + if (!is_valid_scalar_range(slot, 1)) { + return; + } + scalar_source_ptrs_[slot] = ptr; + scalar_dtypes_[slot] = dtype; + } + + void clear_scalar_metadata(int32_t start, int32_t count) { + if (!is_valid_scalar_range(start, count)) { + return; + } + clear_scalar_dtypes(start, count); + clear_scalar_sources(start, count); + } + + void copy_scalar_dtypes_from(const DumpArgSelection &src, int32_t dst_offset, int32_t src_offset, int32_t count) { + if (!is_valid_scalar_range(dst_offset, count) || !is_valid_scalar_range(src_offset, count)) { + return; + } + memcpy(&scalar_dtypes_[dst_offset], &src.scalar_dtypes_[src_offset], count * sizeof(uint8_t)); + clear_scalar_sources(dst_offset, count); + } + +private: + static constexpr int32_t kDumpArgBitCount = 64; + + static bool is_valid_arg_index(int32_t index) { return index >= 0 && index < kDumpArgBitCount; } + + static bool is_valid_arg_range(int32_t start, int32_t count) { + if (start < 0 || count < 0 || count > kDumpArgBitCount) { + return false; + } + return start <= kDumpArgBitCount - count; + } + + static bool is_valid_tensor_count(int32_t tensor_count) { + return tensor_count >= 0 && tensor_count <= CORE_MAX_TENSOR_ARGS; + } + + static bool is_valid_scalar_range(int32_t start, int32_t count) { + if (start < 0 || count < 0 || count > CORE_MAX_SCALAR_ARGS) { + return false; + } + return start <= CORE_MAX_SCALAR_ARGS - count; + } + + void mark_index_ambiguous(int32_t index) { + if (!is_valid_arg_index(index)) { + return; + } + dump_arg_index_ambiguous_mask_ |= (uint64_t{1} << index); + } + + void clear_scalar_sources() { clear_scalar_sources(0, CORE_MAX_SCALAR_ARGS); } + + void clear_scalar_sources(int32_t start, int32_t count) { + if (!is_valid_scalar_range(start, count)) { + return; + } + for (int32_t i = 0; i < count; i++) { + scalar_source_ptrs_[start + i] = 0; + } + } + + void clear_scalar_dtypes(int32_t start, int32_t count) { + if (!is_valid_scalar_range(start, count)) { + return; + } + memset(&scalar_dtypes_[start], 0, count * sizeof(uint8_t)); + } + + uint64_t dump_arg_mask_{0}; + uint64_t dump_arg_index_ambiguous_mask_{0}; + uintptr_t scalar_source_ptrs_[CORE_MAX_SCALAR_ARGS]{}; + uint8_t scalar_dtypes_[CORE_MAX_SCALAR_ARGS]{}; +}; + +#endif // SRC_COMMON_PLATFORM_INCLUDE_AICPU_DUMP_ARG_SELECTION_H_ diff --git a/src/common/task_interface/profiling_config.h b/src/common/task_interface/profiling_config.h new file mode 100644 index 000000000..a81380030 --- /dev/null +++ b/src/common/task_interface/profiling_config.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_COMMON_TASK_INTERFACE_PROFILING_CONFIG_H_ +#define SRC_COMMON_TASK_INTERFACE_PROFILING_CONFIG_H_ + +#ifndef PTO2_PROFILING +#define PTO2_PROFILING 1 +#endif + +#ifndef PTO2_ORCH_PROFILING +#define PTO2_ORCH_PROFILING 0 +#endif + +#ifndef PTO2_SCHED_PROFILING +#define PTO2_SCHED_PROFILING 0 +#endif + +#ifndef PTO2_TENSORMAP_PROFILING +#define PTO2_TENSORMAP_PROFILING 0 +#endif + +#if PTO2_ORCH_PROFILING && !PTO2_PROFILING +#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1" +#endif + +#if PTO2_SCHED_PROFILING && !PTO2_PROFILING +#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1" +#endif + +#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING +#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1" +#endif + +#endif // SRC_COMMON_TASK_INTERFACE_PROFILING_CONFIG_H_