Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,20 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)

option(WITH_TORCH "Enable PyTorch C++ backend" OFF)

# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for
# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed
# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the
# toolchain is compatible or when building via the standalone
# `src/ascend/custom/build.sh` script.
option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF)
# Custom `AscendC` kernels under `src/native/ascend/custom/`. `ON` by default
# so CI and routine dev builds always exercise `implementation_index=1/2`
# for `RmsNorm` / `AddRmsNorm`. Gated by `WITH_ASCEND` in
# `src/CMakeLists.txt`, so non-Ascend builds ignore it. Pass
# `-DBUILD_ASCEND_CUSTOM=OFF` to skip the `ccec` build on Ascend
# machines where the custom kernels aren't needed.
#
# When `ON`, `src/CMakeLists.txt` drives the standalone
# `src/native/ascend/custom/build.sh` via a build-phase custom command and
# links the produced `libno_workspace_kernel.a` into the `ops` module with
# `--whole-archive`. Using the standalone script sidesteps a `CANN`
# `extract_host_stub.py` path bug that breaks in-tree `ascendc_library()`
# under `scikit-build-core` temp-dir builds. Requires `torch_npu` and the
# `AscendC` toolchain (`ccec`).
option(BUILD_ASCEND_CUSTOM "Build custom AscendC kernels" ON)

option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
Expand Down
78 changes: 74 additions & 4 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,70 @@ if(WITH_ASCEND)
list(APPEND DEVICE_LIST "ascend")

# Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`).
if(BUILD_CUSTOM_KERNEL)
add_subdirectory(native/ascend/custom)
if(BUILD_ASCEND_CUSTOM)
# In-tree `ascendc_library()` trips the `CANN` `extract_host_stub.py`
# path-handling bug under `scikit-build-core`'s temp-dir builds
# (`KeyError` on `/./workspace/...` paths in `$<TARGET_OBJECTS>`).
# Work around it by driving the standalone `src/native/ascend/custom/build.sh`;
# that script invokes a separate `cmake` with
# `src/native/ascend/custom/` as its `SOURCE_DIR`, avoiding the buggy
# path shape. The produced `.a` is imported and linked into
# `ops` with `--whole-archive`.
set(_custom_build_dir "${CMAKE_SOURCE_DIR}/build/build_ascend_custom")
set(_custom_lib "${_custom_build_dir}/lib/libno_workspace_kernel.a")
set(_custom_source_dir "${CMAKE_CURRENT_BINARY_DIR}/ascend_custom_source")

if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "")
include(${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/cmake/detect_soc.cmake)
infiniops_detect_soc(SOC_VERSION)
endif()

# Drive `build.sh` as a build-phase target with explicit source
# dependencies so that editing any `op_host/` or `op_kernel/` source,
# `build.sh` itself, or the sub-project's own CMake files re-triggers the
# build (plain `execute_process` at configure time would only gate on file
# existence and leave stale `.a` files in place).
file(GLOB_RECURSE _custom_srcs CONFIGURE_DEPENDS
  "${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/*.cpp"
  "${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/*.h"
  "${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/build.sh")

# The inner build is configured by the sub-project's `CMakeLists.txt` and
# the modules under `cmake/`; changes to them alter the produced archive, so
# track them as inputs too. This glob is intentionally non-recursive so it
# cannot pick up CMake files from stray build directories under the source
# tree.
file(GLOB _custom_build_files CONFIGURE_DEPENDS
  "${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/CMakeLists.txt"
  "${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom/cmake/*.cmake")
list(APPEND _custom_srcs ${_custom_build_files})

# Scrub env inherited from the outer `scikit-build-core` invocation
# before handing control to `build.sh`: `CMAKE_GENERATOR` /
# `CMAKE_EXPORT_COMPILE_COMMANDS` leaking into the inner `cmake`
# change the path format passed to `ninja`'s `_host_cpp` rule and
# re-trigger the `CANN` `extract_host_stub.py` `KeyError`
# (`/./workspace/...`) that standalone `build.sh` avoids.
#
# `pip install` MUST be invoked with `--no-build-isolation` on
# Ascend; otherwise pip's build-isolation overlay shadows system
# `torch` (via `PYTHONPATH`) and the inner `cmake`'s
# `import torch` in `config_envs.cmake` fails with
# `ModuleNotFoundError`.
# Build rule for the custom-kernel static archive (`${_custom_lib}`).
# Steps, in order:
#   1. Remove whatever is at `${_custom_source_dir}` and recreate it as a
#      symlink to the `native/ascend/custom` source directory.
#   2. Run `build.sh` through that symlinked path under `cmake -E env`, with
#      three `CMAKE_*` environment variables unset and `BUILD_DIR` /
#      `MAIN_SRC_DIR` set; `${SOC_VERSION}` is passed as the script's first
#      positional argument.
# The rule re-runs whenever any file in `${_custom_srcs}` changes.
add_custom_command(
OUTPUT ${_custom_lib}
COMMAND ${CMAKE_COMMAND} -E rm -f "${_custom_source_dir}"
COMMAND ${CMAKE_COMMAND} -E create_symlink
"${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom"
"${_custom_source_dir}"
COMMAND ${CMAKE_COMMAND} -E env
--unset=CMAKE_GENERATOR
--unset=CMAKE_EXPORT_COMPILE_COMMANDS
--unset=CMAKE_BUILD_PARALLEL_LEVEL
"BUILD_DIR=${_custom_build_dir}"
"MAIN_SRC_DIR=${CMAKE_CURRENT_SOURCE_DIR}"
bash ${_custom_source_dir}/build.sh ${SOC_VERSION}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/native/ascend/custom
DEPENDS ${_custom_srcs}
COMMENT "Building custom AscendC kernels (SOC_VERSION=${SOC_VERSION})"
VERBATIM)

# Named target that is part of the default build (`ALL`) and depends on the
# archive, so the custom command producing `${_custom_lib}` gets run.
add_custom_target(no_workspace_kernel_build ALL DEPENDS ${_custom_lib})

# Expose the pre-built archive as an imported static library. The archive
# does not exist until the build phase, so the imported target is ordered
# after `no_workspace_kernel_build`.
add_library(no_workspace_kernel STATIC IMPORTED GLOBAL)
set_target_properties(no_workspace_kernel PROPERTIES
IMPORTED_LOCATION "${_custom_lib}")
add_dependencies(no_workspace_kernel no_workspace_kernel_build)

# Link the compiled `AscendC` kernel objects into `infiniops` so that
# custom kernel implementations (e.g. `RmsNorm` index 1) can call
Expand Down Expand Up @@ -379,9 +441,17 @@ if(GENERATE_PYTHON_BINDINGS)
# The `Operator<..., 1>` template instantiations that call
# `aclrtlaunch_*` live in `ops.cc`, so link here with
# `--whole-archive` to ensure all launch functions are available.
if(BUILD_CUSTOM_KERNEL)
# `$<TARGET_FILE>` works for both real `ascendc_library()` targets and
# `IMPORTED` targets pointing at a pre-built `.a`. The
# `no_workspace_kernel` target is only created inside the
# `WITH_ASCEND` block above, so this branch must mirror that gate;
# otherwise non-Ascend builds error out with "No target
# no_workspace_kernel".
if(WITH_ASCEND AND BUILD_ASCEND_CUSTOM)
target_link_libraries(ops PRIVATE
-Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
-Wl,--whole-archive $<TARGET_FILE:no_workspace_kernel> -Wl,--no-whole-archive)
# `ops` link step must wait for `build.sh` to produce the `.a`.
add_dependencies(ops no_workspace_kernel_build)
endif()

set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
Expand Down
22 changes: 17 additions & 5 deletions src/native/ascend/custom/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ else()
endif()

set(PROJECT_OP_SRC_BASE ${PROJECT_SOURCE_DIR})
set(PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/build)
set(PROJECT_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/output)
# Default location of the main InfiniOps `src/` directory, used when this
# sub-project is configured directly without `-DMAIN_SRC_DIR=...`. This
# sub-project lives in `src/native/ascend/custom/`, so the main `src/`
# directory is three levels up (two levels up would be `src/native/`).
# `build.sh` always passes `-DMAIN_SRC_DIR` explicitly, which overrides
# this default.
set(MAIN_SRC_DIR "${PROJECT_OP_SRC_BASE}/../../.." CACHE PATH
  "Main InfiniOps source directory.")

include(cmake/config_envs.cmake)
include(cmake/config_ascend.cmake)
Expand All @@ -43,13 +43,15 @@ if(CCACHE_PROGRAM)
set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
endif()

# Shared library output location.
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_PATH})
# `CMAKE_LIBRARY_OUTPUT_DIRECTORY` is set by `build.sh` so that the
# standalone `libascend_kernel.so` lands under
# `<repo>/build/build_ascend_custom/output/`, beside the `lib/` directory
# that holds `libno_workspace_kernel.a`.

# Host-side files.
file(GLOB OP_SRCS
${PROJECT_OP_SRC_BASE}/torch_binding.cpp
${PROJECT_OP_SRC_BASE}/rms_norm/op_host/rms_norm.cpp
${PROJECT_OP_SRC_BASE}/add_rms_norm/op_host/add_rms_norm.cpp
)

# Shared library name — consumed by `kernel_custom.h` variants and by the
Expand All @@ -58,9 +60,19 @@ set(OP_PLUGIN_NAME ascend_kernel)

# Kernel-side files (device code compiled by the `AscendC` toolchain).
ascendc_library(no_workspace_kernel STATIC
${PROJECT_OP_SRC_BASE}/rms_norm/op_kernel/rms_norm.cpp
rms_norm/op_kernel/rms_norm.cpp
add_rms_norm/op_kernel/add_rms_norm.cpp
)

# The kernel translation units include `"data_type_enum.h"` from the main
# project's `src/` so that launcher and device code share one `DataType`
# enum. `ascendc_library` forwards the interface target's `INCLUDES`
# property to the nested `ExternalProject_Add` (see
# `${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake/legacy_modules/function.cmake`),
# so append the main `src/` dir here.
set_property(TARGET no_workspace_kernel_interface APPEND PROPERTY
INCLUDES ${MAIN_SRC_DIR})

# Create the shared library `libascend_kernel.so`.
add_library(${OP_PLUGIN_NAME} SHARED ${OP_SRCS})

Expand Down
25 changes: 18 additions & 7 deletions src/native/ascend/custom/build.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
#!/bin/bash
# Build custom `AscendC` kernels into `libascend_kernel.so`.
# Build custom `AscendC` kernels into `libno_workspace_kernel.a` (+ the
# standalone `libascend_kernel.so`).
#
# Intermediate artifacts default to `<repo>/build/build_ascend_custom/`
# so the source tree under `src/` stays free of build output. Override
# via `BUILD_DIR=<abs-path> bash build.sh <soc>` if needed.
set -e

SOC_VERSION="${1:-Ascend910_9382}"
Expand All @@ -10,20 +15,26 @@ source "${_CANN_TOOLKIT_INSTALL_PATH}/set_env.sh"
echo "CANN: ${ASCEND_TOOLKIT_HOME}"

ASCEND_INCLUDE_DIR=${ASCEND_TOOLKIT_HOME}/$(arch)-linux/include
CURRENT_DIR=$(pwd)
OUTPUT_DIR=${CURRENT_DIR}/output
mkdir -p "${OUTPUT_DIR}"

BUILD_DIR=build
# Resolve build directory. The script lives in
# `<repo>/src/native/ascend/custom/`, so `<repo>/` is four levels above the
# script's directory (three levels up is only `<repo>/src/`).
# `SCRIPT_DIR` is the directory as invoked (possibly via a symlink);
# `REAL_SCRIPT_DIR` is the symlink-resolved location and is the one used to
# find the repository root.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
REAL_SCRIPT_DIR=$(cd "$(dirname "$(readlink -f "$0")")" && pwd)
REPO_ROOT=$(cd "${REAL_SCRIPT_DIR}/../../../.." && pwd)
# Caller-supplied `BUILD_DIR` / `MAIN_SRC_DIR` environment values win over
# the repo-relative defaults.
BUILD_DIR="${BUILD_DIR:-${REPO_ROOT}/build/build_ascend_custom}"
OUTPUT_DIR="${BUILD_DIR}/output"
MAIN_SRC_DIR="${MAIN_SRC_DIR:-${REPO_ROOT}/src}"

rm -rf "${BUILD_DIR}"
mkdir -p "${BUILD_DIR}"
mkdir -p "${BUILD_DIR}" "${OUTPUT_DIR}"

cmake \
-DASCEND_HOME_PATH="${ASCEND_HOME_PATH}" \
-DASCEND_INCLUDE_DIR="${ASCEND_INCLUDE_DIR}" \
-DSOC_VERSION="${SOC_VERSION}" \
-DCMAKE_LIBRARY_OUTPUT_DIRECTORY="${OUTPUT_DIR}" \
-DMAIN_SRC_DIR="${MAIN_SRC_DIR}" \
-B "${BUILD_DIR}" \
-S .
-S "${SCRIPT_DIR}"

cmake --build "${BUILD_DIR}" -j 16

Expand Down
14 changes: 3 additions & 11 deletions src/native/ascend/custom/cmake/config_ascend.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,9 @@ set(ASCEND_CANN_PACKAGE_PATH ${ASCEND_HOME_PATH})
# Auto-detect `SOC_VERSION` from `npu-smi info` if not set externally.
# Required by `CANN`'s `ascendc.cmake` for `AscendC` kernel compilation.
if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "")
execute_process(
COMMAND bash -c "npu-smi info 2>/dev/null | awk '/910B|910A|310/ {for (i=1;i<=NF;i++) if ($i ~ /^(910|310)/) {print \"Ascend\" $i; exit}}'"
OUTPUT_VARIABLE _DETECTED_SOC
OUTPUT_STRIP_TRAILING_WHITESPACE)

if(_DETECTED_SOC)
set(SOC_VERSION "${_DETECTED_SOC}" CACHE STRING "Ascend SOC version" FORCE)
else()
set(SOC_VERSION "Ascend910B4" CACHE STRING "Ascend SOC version" FORCE)
endif()

include(${CMAKE_CURRENT_LIST_DIR}/detect_soc.cmake)
infiniops_detect_soc(_detected_soc)
set(SOC_VERSION "${_detected_soc}" CACHE STRING "Ascend SOC version" FORCE)
message(STATUS "SOC_VERSION auto-set to ${SOC_VERSION}")
endif()

Expand Down
24 changes: 24 additions & 0 deletions src/native/ascend/custom/cmake/detect_soc.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Auto-detect the Ascend SOC version from `npu-smi info`.
#
# `infiniops_detect_soc(<out_var>)` parses the first `910*` / `310*` entry
# in `npu-smi info` and writes `Ascend<NNNX>` into the named variable in
# the caller's scope. Falls back to `Ascend910B4` when detection fails
# (no NPU on the host, `npu-smi` missing, output format mismatch).
#
# Called from both `src/CMakeLists.txt` (outer `pip install` build, to
# forward `SOC_VERSION` to the standalone `build.sh` invocation) and
# `src/native/ascend/custom/cmake/config_ascend.cmake` (the sub-build driven
# by that `build.sh`).

function(infiniops_detect_soc out_var)
  # Shell pipeline run via `bash -c`: on `npu-smi info` lines matching
  # `910B`, `910A` or `310`, print the first whitespace-separated field that
  # starts with `910` or `310`, prefixed with `Ascend`, then stop.
  set(_probe_cmd "npu-smi info 2>/dev/null | awk '/910B|910A|310/ {for (i=1;i<=NF;i++) if ($i ~ /^(910|310)/) {print \"Ascend\" $i; exit}}'")

  execute_process(
    COMMAND bash -c "${_probe_cmd}"
    OUTPUT_VARIABLE _soc_name
    OUTPUT_STRIP_TRAILING_WHITESPACE)

  # Nothing printed by the probe: use the fixed fallback SOC name.
  if(NOT _soc_name)
    set(_soc_name "Ascend910B4")
  endif()

  # Hand the result back through the caller-named variable.
  set(${out_var} "${_soc_name}" PARENT_SCOPE)
endfunction()
Loading