diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91c2b015..bc2fe715 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,12 +18,20 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)
 option(WITH_TORCH "Enable PyTorch C++ backend" OFF)
 
 
-# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for
-# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed
-# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the
-# toolchain is compatible or when building via the standalone
-# `src/ascend/custom/build.sh` script.
-option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF)
+# Custom `AscendC` kernels under `src/native/ascend/custom/`. `ON` by default
+# so CI and routine dev builds always exercise `implementation_index=1/2`
+# for `RmsNorm` / `AddRmsNorm`. Gated by `WITH_ASCEND` in
+# `src/CMakeLists.txt`, so non-Ascend builds ignore it. Pass
+# `-DBUILD_ASCEND_CUSTOM=OFF` to skip the `ccec` build on Ascend
+# machines where the custom kernels aren't needed.
+#
+# When `ON`, `src/CMakeLists.txt` drives the standalone
+# `src/native/ascend/custom/build.sh` via a build-phase custom command. This
+# sidesteps a `CANN` `extract_host_stub.py` path bug that breaks in-tree
+# `ascendc_library()` under `scikit-build-core` temp-dir builds, then links
+# the produced `libno_workspace_kernel.a` into the `ops` module with
+# `--whole-archive`. Requires `torch_npu` and the `AscendC` toolchain (`ccec`).
+option(BUILD_ASCEND_CUSTOM "Build custom AscendC kernels" ON)
 
 option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
 option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ce888b4b..924a4576 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -241,8 +241,70 @@ if(WITH_ASCEND)
     list(APPEND DEVICE_LIST "ascend")
 
     # Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`).
-    if(BUILD_CUSTOM_KERNEL)
-        add_subdirectory(native/ascend/custom)
+    if(BUILD_ASCEND_CUSTOM)
+        # In-tree `ascendc_library()` trips the `CANN` `extract_host_stub.py`
+        # path-handling bug under `scikit-build-core`'s temp-dir builds
+        # (`KeyError` on `/./workspace/...` paths in `$`).
+        # Work around it by driving the standalone `src/native/ascend/custom/build.sh`;
+        # that script invokes a separate `cmake` with
+        # `src/native/ascend/custom/` as its `SOURCE_DIR`, avoiding the buggy
+        # path shape. The produced `.a` is imported and linked into
+        # `ops` with `--whole-archive`.
+        set(_custom_build_dir "${CMAKE_SOURCE_DIR}/build/build_ascend_custom")
+        set(_custom_lib "${_custom_build_dir}/lib/libno_workspace_kernel.a")
+        set(_custom_source_dir "${CMAKE_CURRENT_BINARY_DIR}/ascend_custom_source")
+
+        if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "")
+            include(${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/cmake/detect_soc.cmake)
+            infiniops_detect_soc(SOC_VERSION)
+        endif()
+
+        # Drive `build.sh` as a build-phase target with explicit source
+        # dependencies so that editing any `op_host/` or `op_kernel/`
+        # source re-triggers the build (plain `execute_process` at
+        # configure time would only gate on file existence and leave
+        # stale `.a` files in place).
+        file(GLOB_RECURSE _custom_srcs CONFIGURE_DEPENDS
+            "${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/*.cpp"
+            "${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/*.h"
+            "${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/build.sh")
+
+        # Scrub env inherited from the outer `scikit-build-core` invocation
+        # before handing control to `build.sh`: `CMAKE_GENERATOR` /
+        # `CMAKE_EXPORT_COMPILE_COMMANDS` leaking into the inner `cmake`
+        # change the path format passed to `ninja`'s `_host_cpp` rule and
+        # re-trigger the `CANN` `extract_host_stub.py` `KeyError`
+        # (`/./workspace/...`) that standalone `build.sh` avoids.
+        #
+        # `pip install` MUST be invoked with `--no-build-isolation` on
+        # Ascend; otherwise pip's build-isolation overlay shadows system
+        # `torch` (via `PYTHONPATH`) and the inner `cmake`'s
+        # `import torch` in `config_envs.cmake` fails with
+        # `ModuleNotFoundError`.
+        add_custom_command(
+            OUTPUT ${_custom_lib}
+            COMMAND ${CMAKE_COMMAND} -E rm -f "${_custom_source_dir}"
+            COMMAND ${CMAKE_COMMAND} -E create_symlink
+                "${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom"
+                "${_custom_source_dir}"
+            COMMAND ${CMAKE_COMMAND} -E env
+                --unset=CMAKE_GENERATOR
+                --unset=CMAKE_EXPORT_COMPILE_COMMANDS
+                --unset=CMAKE_BUILD_PARALLEL_LEVEL
+                "BUILD_DIR=${_custom_build_dir}"
+                "MAIN_SRC_DIR=${CMAKE_CURRENT_SOURCE_DIR}"
+                bash ${_custom_source_dir}/build.sh ${SOC_VERSION}
+            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom
+            DEPENDS ${_custom_srcs}
+            COMMENT "Building custom AscendC kernels (SOC_VERSION=${SOC_VERSION})"
+            VERBATIM)
+
+        add_custom_target(no_workspace_kernel_build ALL DEPENDS ${_custom_lib})
+
+        add_library(no_workspace_kernel STATIC IMPORTED GLOBAL)
+        set_target_properties(no_workspace_kernel PROPERTIES
+            IMPORTED_LOCATION "${_custom_lib}")
+        add_dependencies(no_workspace_kernel no_workspace_kernel_build)
 
         # Link the compiled `AscendC` kernel objects into `infiniops` so that
         # custom kernel implementations (e.g. `RmsNorm` index 1) can call
@@ -379,9 +441,17 @@ if(GENERATE_PYTHON_BINDINGS)
     # The `Operator<..., 1>` template instantiations that call
     # `aclrtlaunch_*` live in `ops.cc`, so link here with
     # `--whole-archive` to ensure all launch functions are available.
+    # `$<TARGET_FILE:no_workspace_kernel>` works for both real `ascendc_library()` targets and
+    # `IMPORTED` targets pointing at a pre-built `.a`. The
+    # `no_workspace_kernel` target is only created inside the
+    # `WITH_ASCEND` block above, so this branch must mirror that gate;
+    # otherwise non-Ascend builds error out with "No target
+    # no_workspace_kernel".
+    if(WITH_ASCEND AND BUILD_ASCEND_CUSTOM)
         target_link_libraries(ops PRIVATE
-            -Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
+            -Wl,--whole-archive $<TARGET_FILE:no_workspace_kernel> -Wl,--no-whole-archive)
+        # `ops` link step must wait for `build.sh` to produce the `.a`.
+        add_dependencies(ops no_workspace_kernel_build)
     endif()
 
     set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
diff --git a/src/native/ascend/custom/CMakeLists.txt b/src/native/ascend/custom/CMakeLists.txt
index ca6e6883..154720aa 100644
--- a/src/native/ascend/custom/CMakeLists.txt
+++ b/src/native/ascend/custom/CMakeLists.txt
@@ -30,8 +30,8 @@ else()
 endif()
 
 set(PROJECT_OP_SRC_BASE ${PROJECT_SOURCE_DIR})
-set(PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/build)
-set(PROJECT_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/output)
+set(MAIN_SRC_DIR "${PROJECT_OP_SRC_BASE}/../.." CACHE PATH
+    "Main InfiniOps source directory.")
 
 include(cmake/config_envs.cmake)
 include(cmake/config_ascend.cmake)
@@ -43,13 +43,15 @@ if(CCACHE_PROGRAM)
     set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
 endif()
 
-# Shared library output location.
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_PATH})
+# `CMAKE_LIBRARY_OUTPUT_DIRECTORY` is set by `build.sh` so that the
+# standalone `libascend_kernel.so` lands next to `libno_workspace_kernel.a`
+# under `<repo>/build/build_ascend_custom/output/`.
 
 # Host-side files.
 file(GLOB OP_SRCS
     ${PROJECT_OP_SRC_BASE}/torch_binding.cpp
     ${PROJECT_OP_SRC_BASE}/rms_norm/op_host/rms_norm.cpp
+    ${PROJECT_OP_SRC_BASE}/add_rms_norm/op_host/add_rms_norm.cpp
 )
 
 # Shared library name — consumed by `kernel_custom.h` variants and by the
@@ -58,9 +60,19 @@ set(OP_PLUGIN_NAME ascend_kernel)
 
 # Kernel-side files (device code compiled by the `AscendC` toolchain).
ascendc_library(no_workspace_kernel STATIC - ${PROJECT_OP_SRC_BASE}/rms_norm/op_kernel/rms_norm.cpp + rms_norm/op_kernel/rms_norm.cpp + add_rms_norm/op_kernel/add_rms_norm.cpp ) +# The kernel translation units include `"data_type_enum.h"` from the main +# project's `src/` so that launcher and device code share one `DataType` +# enum. `ascendc_library` forwards the interface target's `INCLUDES` +# property to the nested `ExternalProject_Add` (see +# `${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake/legacy_modules/function.cmake`), +# so append the main `src/` dir here. +set_property(TARGET no_workspace_kernel_interface APPEND PROPERTY + INCLUDES ${MAIN_SRC_DIR}) + # Create the shared library `libascend_kernel.so`. add_library(${OP_PLUGIN_NAME} SHARED ${OP_SRCS}) diff --git a/src/native/ascend/custom/build.sh b/src/native/ascend/custom/build.sh index 258a88e4..f7e1cf68 100755 --- a/src/native/ascend/custom/build.sh +++ b/src/native/ascend/custom/build.sh @@ -1,5 +1,10 @@ #!/bin/bash -# Build custom `AscendC` kernels into `libascend_kernel.so`. +# Build custom `AscendC` kernels into `libno_workspace_kernel.a` (+ the +# standalone `libascend_kernel.so`). +# +# Intermediate artifacts default to `/build/build_ascend_custom/` +# so the source tree under `src/` stays free of build output. Override +# via `BUILD_DIR= bash build.sh ` if needed. set -e SOC_VERSION="${1:-Ascend910_9382}" @@ -10,20 +15,26 @@ source "${_CANN_TOOLKIT_INSTALL_PATH}/set_env.sh" echo "CANN: ${ASCEND_TOOLKIT_HOME}" ASCEND_INCLUDE_DIR=${ASCEND_TOOLKIT_HOME}/$(arch)-linux/include -CURRENT_DIR=$(pwd) -OUTPUT_DIR=${CURRENT_DIR}/output -mkdir -p "${OUTPUT_DIR}" -BUILD_DIR=build +# Resolve build directory. `