From d4fd805dda7c60a2c09983a9cd5aa1b04d9477d1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Feb 2026 09:57:04 -0800 Subject: [PATCH 01/50] Rename folder dpctl to dpctl_ext --- .../tensor/libtensor/include/kernels/alignment.hpp | 0 .../tensor/libtensor/include/kernels/dpctl_tensor_types.hpp | 0 .../libtensor/include/kernels/elementwise_functions/common.hpp | 0 .../include/kernels/elementwise_functions/common_detail.hpp | 0 .../include/kernels/elementwise_functions/logaddexp.hpp | 0 .../libtensor/include/kernels/elementwise_functions/maximum.hpp | 0 .../libtensor/include/kernels/elementwise_functions/minimum.hpp | 0 .../include/kernels/elementwise_functions/sycl_complex.hpp | 0 .../include/kernels/elementwise_functions/vec_size_util.hpp | 0 .../tensor/libtensor/include/utils/indexing_utils.hpp | 0 .../tensor/libtensor/include/utils/math_utils.hpp | 0 .../tensor/libtensor/include/utils/memory_overlap.hpp | 0 .../tensor/libtensor/include/utils/offset_utils.hpp | 0 .../tensor/libtensor/include/utils/output_validation.hpp | 0 .../tensor/libtensor/include/utils/strided_iters.hpp | 0 .../tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 0 .../tensor/libtensor/include/utils/sycl_utils.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch_building.hpp | 0 .../tensor/libtensor/include/utils/type_utils.hpp | 0 dpnp/backend/extensions/blas/CMakeLists.txt | 2 +- dpnp/backend/extensions/fft/CMakeLists.txt | 2 +- dpnp/backend/extensions/indexing/CMakeLists.txt | 2 +- dpnp/backend/extensions/lapack/CMakeLists.txt | 2 +- dpnp/backend/extensions/statistics/CMakeLists.txt | 2 +- dpnp/backend/extensions/ufunc/CMakeLists.txt | 2 +- dpnp/backend/extensions/vm/CMakeLists.txt | 2 +- dpnp/backend/extensions/window/CMakeLists.txt | 2 +- 28 files changed, 8 insertions(+), 8 deletions(-) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/alignment.hpp (100%) rename {dpctl => 
dpctl_ext}/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/indexing_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/math_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/memory_overlap.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/offset_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/output_validation.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/strided_iters.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_alloc_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch_building.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_utils.hpp (100%) diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/alignment.hpp rename to 
dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp 
b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/indexing_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/math_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/memory_overlap.hpp rename to dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp 
b/dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/offset_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/output_validation.hpp rename to dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/strided_iters.hpp rename to dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp diff 
--git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index 0015eda84843..cbc3e31d923b 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -68,7 +68,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index 0569ecc8bca4..edc7bff7dce4 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -61,7 +61,7 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index c0de75ae3146..39f68ffba846 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -65,7 +65,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git 
a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 76b25c3a6d10..59499a3b28f8 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -86,7 +86,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index e04279b75e49..8544e816e8d6 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -70,7 +70,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index 55a750f8423f..293cef0ab326 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -88,7 +88,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index 32d6a6765a00..551c43842af2 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ 
b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -110,7 +110,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt index 6fe04e334f42..01274317782d 100644 --- a/dpnp/backend/extensions/window/CMakeLists.txt +++ b/dpnp/backend/extensions/window/CMakeLists.txt @@ -66,7 +66,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) From c040713d50cd10c628990b628cb74b0a5029f99b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:04:36 -0800 Subject: [PATCH 02/50] Add simplify_iteration_space implementation to libtensor --- .../source/simplify_iteration_space.cpp | 544 ++++++++++++++++++ .../source/simplify_iteration_space.hpp | 130 +++++ 2 files changed, 674 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp new file mode 100644 index 000000000000..2526f022e0ac --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ -0,0 +1,544 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "simplify_iteration_space.hpp" +#include "utils/strided_iters.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + using dpctl::tensor::strides::simplify_iteration_stride; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + + simplified_strides.reserve(nd); + simplified_strides.insert(std::end(simplified_strides), + std::begin(strides), std::end(strides)); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + simplified_strides.push_back((strides[0] >= 0) ? 
strides[0] + : -strides[0]); + if ((strides[0] < 0) && (shape[0] > 1)) { + offset += (shape[0] - 1) * strides[0]; + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &src_strides, + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_two_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::begin(simplified_shape), shape, + shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_src_strides.insert(std::end(simplified_src_strides), + std::begin(src_strides), + std::end(src_strides)); + assert(simplified_src_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_two_strides( + nd, simplified_shape.data(), simplified_src_strides.data(), + simplified_dst_strides.data(), + src_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + 
simplified_dst_strides.reserve(nd); + + if (src_strides[0] < 0 && dst_strides[0] < 0) { + simplified_src_strides.push_back(-src_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src_offset += (shape[0] - 1) * src_strides[0]; + dst_offset += (shape[0] - 1) * dst_strides[0]; + } + } + else { + simplified_src_strides.push_back(src_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_three_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + 
assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (dst_strides[0] < 0)) { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_4( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // src3 + std::vector const &src3_strides, + // dst + std::vector const &dst_strides, + // output + 
std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_src3_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &src3_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_four_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_src3_strides.reserve(nd); + simplified_src3_strides.insert(std::end(simplified_src3_strides), + std::begin(src3_strides), + std::end(src3_strides)); + assert(simplified_src3_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_four_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_src3_strides.data(), + simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + src3_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); 
+ simplified_src2_strides.resize(contracted_nd); + simplified_src3_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + src3_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_src3_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (src3_strides[0] < 0) && (dst_strides[0] < 0)) + { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_src3_strides.push_back(-src3_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + src3_offset += src3_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_src3_strides.push_back(src3_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src3_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + 
compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +void split_iteration_space(const std::vector &shape_vec, + const std::vector &strides_vec, + int axis_start, + int axis_end, + std::vector &dir1_shape_vec, + std::vector &dir2_shape_vec, + std::vector &dir1_strides_vec, + std::vector &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + 
std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +py::ssize_t _ravel_multi_index_c(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(nd - 1 - i) * s; + s *= shape.at(nd - 1 - i); + } + + return flat_index; +} + +py::ssize_t _ravel_multi_index_f(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(i) * s; + s *= shape.at(i); + } + + return flat_index; +} + +std::vector _unravel_index_c(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[nd - 1 - dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[nd - 1 - dim] = r; + i_ = q; + } + if (nd) { + mi[0] = i_; + } + return mi; +} + +std::vector _unravel_index_f(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[dim] = r; + i_ = q; + } + if (nd) { + mi[nd - 1] = i_; + } + return mi; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp 
b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp new file mode 100644 index 000000000000..d3448ee1f5fd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -0,0 +1,130 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector &, + std::vector &, + py::ssize_t &); + +void simplify_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector const &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_3(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_4(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // src3 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + +void split_iteration_space(const std::vector &, + const std::vector &, + int, + int, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &); + +py::ssize_t _ravel_multi_index_c(std::vector const &, + std::vector const &); +py::ssize_t _ravel_multi_index_f(std::vector const &, + std::vector const &); +std::vector _unravel_index_c(py::ssize_t, + 
std::vector const &); +std::vector _unravel_index_f(py::ssize_t, + std::vector const &); +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 14b466facfe6b23f92113ccc2dbb224e2727bf3c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:14:43 -0800 Subject: [PATCH 03/50] Extend codespell ignore list for libtensor --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cdf592535d11..67fb75cb5f54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT" quiet-level = 3 [tool.coverage.report] From dcc421bc61c36549d3e6865927f495abab15d078 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:15:09 -0800 Subject: [PATCH 04/50] Add copy_and_cast kernels to libtensor --- .../include/kernels/copy_and_cast.hpp | 1288 +++++++++++++++++ .../include/kernels/copy_as_contiguous.hpp | 655 +++++++++ .../libtensor/source/copy_as_contig.cpp | 758 ++++++++++ .../libtensor/source/copy_as_contig.hpp | 61 + 4 files changed, 2762 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp new file mode 100644 index 000000000000..a07d311a7fcb --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ 
-0,0 +1,1288 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_and_cast +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class copy_cast_generic_kernel; + +template +class copy_cast_contig_kernel; + +template +class copy_cast_from_host_kernel; + +template +class copy_cast_from_host_contig_kernel; + +template +class Caster +{ +public: + Caster() = default; + dstTy operator()(const srcTy &src) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(src); + } +}; + +template +class GenericCopyFunctor +{ +private: + const srcT *src_ = nullptr; + dstT *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFunctor(const srcT *src_p, dstT *dst_p, const IndexerT &indexer) + : src_(src_p), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + static constexpr CastFnT fn{}; + dst_[dst_offset] = fn(src_[src_offset]); + } +}; + +/*! + @defgroup CopyAndCastKernels + */ + +/*! + * @brief Function pointer type for generic array cast and copying function. 
+ */ +typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to + `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have array dimensionality specified via argument `nd`. The + `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the + first `nd` elements encode common shape, second `nd` elements contain strides + of `src` array, and the trailing `nd` elements contain strides of `dst` array. + `src_p` and `dst_p` represent pointers into respective arrays, but the start of + iteration begins at offset of `src_offset` elements for `src` array and at + offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue + `q` with events `depends` and `additional_depends` as dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param nd Array dimensionality, i.e. number of indices needed to + identify an element of each array. + @param shape_and_strides Kernel accessible USM pointer to packed shape and + strides. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of + elements of source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of + elements of destination array from `dst_p`. + @param depends List of events to wait for before starting computations, if + any. + @param additional_depends Additional list of events to wait for before + starting computations, if any. + + @return Event to wait on to ensure that computation completes. 
+ @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_generic_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + const TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset, + shape_and_strides}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFunctor, + TwoOffsets_StridedIndexer>(src_tp, dst_tp, + indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get generic function pointer of type `fnT` for given source + * data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastGenericFactory +{ + fnT get() + { + fnT f = copy_and_cast_generic_impl; + return f; + } +}; + +// Specialization of copy_and_cast for contiguous arrays + +template +class ContigCopyFunctor +{ +private: + std::size_t nelems; + const srcT *src_p = nullptr; + dstT *dst_p = nullptr; + +public: + ContigCopyFunctor(const std::size_t nelems_, + const srcT *src_p_, + dstT *dst_p_) + : nelems(nelems_), src_p(src_p_), dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr CastFnT fn{}; + + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex_v; + if constexpr (!enable_sg_loadstore || is_complex_v || + is_complex_v) { + std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * elems_per_sg + (gid % sgSize) + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + dst_p[offset] = fn(src_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&src_p[offset]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + 
sycl::access::decorated::yes>(&dst_p[offset]); +
+ const sycl::vec src_vec = + sub_group_load(sg, src_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; k++) { + dst_vec[k] = fn(src_vec[k]); + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < nelems; k += sgSize) { + dst_p[k] = fn(src_p[k]); + } + } + } + } +}; + +/*! + * @brief Function pointer type for contiguous array cast and copy function. + */ +typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +/*! + * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray + to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have the same number of elements `nelems`. + `src_cp` and `dst_cp` represent char pointers to the start of respective + arrays. Kernel is submitted to sycl queue `q` with events `depends` as + dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param src_cp Kernel accessible USM pointer for the source array + @param dst_cp Kernel accessible USM pointer for the destination array + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes.
+ @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *src_cp, + char *dst_cp, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const srcTy *src_tp = reinterpret_cast(src_cp); + dstTy *dst_tp = reinterpret_cast(dst_cp); + + std::size_t lws = 64; + static constexpr std::uint32_t vec_sz = 4; + static constexpr std::uint32_t n_vecs = 2; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(src_cp) && + is_aligned(dst_cp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = + copy_cast_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, enable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + copy_cast_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, disable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get specialized function pointer for casting and copying + * contiguous arrays. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_contig_impl; + return f; + } +}; + +// Specialization of copy_and_cast for 1D arrays + +/*! + * @brief Function pointer type for casting and copying 1D arrays.
+ * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Function pointer type for casting and copying 2D arrays. + * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Function, specialized for given array dimension, to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameter `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `dst_p`. + @param depends List of events to wait for before starting computations, if + any.
+ + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_nd_specialized_impl( + sycl::queue &q, + std::size_t nelems, + const std::array &shape, + const std::array &src_strides, + const std::array &dst_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + using IndexerT = TwoOffsets_FixedDimStridedIndexer; + const IndexerT indexer{shape, src_strides, dst_strides, src_offset, + dst_offset}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.depends_on(depends); + cgh.parallel_for< + class copy_cast_generic_kernel>( + sycl::range<1>(nelems), + GenericCopyFunctor, IndexerT>( + src_tp, dst_tp, indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +/*! + * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast2DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +// ====================== Copying from host to USM + +template +class GenericCopyFromHostFunctor +{ +private: + AccessorT src_acc_; + dstTy *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFromHostFunctor(const AccessorT &src_acc, + dstTy *dst_p, + const IndexerT &indexer) + : src_acc_(src_acc), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + CastFnT fn{}; + dst_[dst_offset] = fn(src_acc_[src_offset]); + } +}; + +typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + ssize_t, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `dstTy`. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Arrays' metadata are given in packed USM vector of length `3*nd` whose first + * `nd` elements contain arrays' shape, next `nd` elements specify source + * strides in elements (not bytes), and trailing `nd` elements specify + * destination array strides. Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy.
+ * @param nd The dimensionality of arrays + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides. + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param src_min_nelem_offset Smallest value of offset relative to + * `host_src_p` in number of elements attained while iterating over elements of + * the source array. + * @param src_max_nelem_offset Largest value of offset relative to `host_src_p` + * in number of elements attained while iterating over elements of the source + * array. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. 
+ * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *host_src_p, + ssize_t src_offset, + ssize_t src_min_nelem_offset, + ssize_t src_max_nelem_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; + + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_min_nelem_offset, + sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + const TwoOffsets_StridedIndexer indexer{ + nd, src_offset - src_min_nelem_offset, dst_offset, + const_cast(shape_and_strides)}; + + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor, + TwoOffsets_StridedIndexer>( + npy_acc, dst_tp, indexer)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_impl; + return f; + } +}; + +typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, /* nelems */ + const char *, /* src_pointer */ + ssize_t, /* src_offset */ + char *, /* dst_pointer */ + ssize_t, /* dst_offset */ + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `dstTy` for contiguous arrays. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Source and destination are both indexed through `NoOpIndexer`, so the + * function takes no stride arguments. + * Kernel dependencies are given by the vector of + * events `depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any.
+ * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_contig_impl( + sycl::queue &q, + std::size_t nelems, + const char *host_src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_offset, + sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + using IndexerT = TwoOffsets_CombinedIndexer; + static constexpr NoOpIndexer src_indexer{}; + static constexpr NoOpIndexer dst_indexer{}; + static constexpr TwoOffsets_CombinedIndexer indexer{src_indexer, + dst_indexer}; + + dstTy *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for< + copy_cast_from_host_contig_kernel>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor, IndexerT>( + npy_acc, dst_tp, indexer)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`.
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_contig_impl; + return f; + } +}; + +// =============== Copying for reshape ================== // + +template +class copy_for_reshape_generic_kernel; + +template +class GenericCopyForReshapeFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + GenericCopyForReshapeFunctor(const char *src_ptr, + char *dst_ptr, + const SrcIndexerT &src_indexer, + const DstIndexerT &dst_indexer) + : src_p(reinterpret_cast(src_ptr)), + dst_p(reinterpret_cast(dst_ptr)), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const ssize_t src_offset = src_indexer_(wiid.get(0)); + const ssize_t dst_offset = dst_indexer_(wiid.get(0)); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_reshape_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // src_nd + int, // dst_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to copy content of array while reshaping. + * + * Submits a kernel to perform a copy `dst[unravel_index(i, + * dst.shape)] = src[unravel_index(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param nelems The number of elements to copy + * @param src_nd Array dimension of the source array + * @param dst_nd Array dimension of the destination array + * @param packed_shapes_and_strides Kernel accessible USM array of size + * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape, + * dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event + copy_for_reshape_generic_impl(sycl::queue &q, + std::size_t nelems, + int src_nd, + int dst_nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + + const ssize_t *src_shape_and_strides = + const_cast(packed_shapes_and_strides); + + const ssize_t *dst_shape_and_strides = const_cast( + packed_shapes_and_strides + (2 * src_nd)); + + const StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides}; + const StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides}; + + using KernelName = + copy_for_reshape_generic_kernel; + + cgh.parallel_for( + sycl::range<1>(nelems), + GenericCopyForReshapeFunctor( + src_p, dst_p, src_indexer, dst_indexer)); + }); + + return copy_for_reshape_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForReshapeGenericFactory +{ + fnT get() + { + fnT f = copy_for_reshape_generic_impl; + return f; + } +}; + +// ================== Copying for roll ================== // + +/*! 
@brief Functor to cyclically roll global_id to the left */ +struct LeftRolled1DTransformer +{ + LeftRolled1DTransformer(std::size_t offset, std::size_t size) + : offset_(offset), size_(size) + { + } + + std::size_t operator()(std::size_t gid) const + { + const std::size_t shifted_gid = + ((gid < offset_) ? gid + size_ - offset_ : gid - offset_); + return shifted_gid; + } + +private: + std::size_t offset_ = 0; + std::size_t size_ = 1; +}; + +/*! @brief Indexer functor to compose indexer and transformer */ +template +struct CompositionIndexer +{ + CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {} + + auto operator()(std::size_t gid) const + { + return f_(t_(gid)); + } + +private: + IndexerT f_; + TransformerT t_; +}; + +/*! @brief Indexer functor to find offset for nd-shifted indices lifted from + * iteration id */ +struct RolledNDIndexer +{ + RolledNDIndexer(int nd, + const ssize_t *shape, + const ssize_t *strides, + const ssize_t *ndshifts, + ssize_t starting_offset) + : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts), + starting_offset_(starting_offset) + { + } + + ssize_t operator()(std::size_t gid) const + { + return compute_offset(gid); + } + +private: + int nd_ = -1; + const ssize_t *shape_ = nullptr; + const ssize_t *strides_ = nullptr; + const ssize_t *ndshifts_ = nullptr; + ssize_t starting_offset_ = 0; + + ssize_t compute_offset(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd_); + ssize_t relative_offset_(0); + _ind.get_left_rolled_displacement( + gid, + shape_, // shape ptr + strides_, // strides ptr + ndshifts_, // shifts ptr + relative_offset_); + return starting_offset_ + relative_offset_; + } +}; + +template +class copy_for_roll_strided_kernel; + +template +class StridedCopyForRollFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + StridedCopyForRollFunctor(const Ty *src_ptr, + Ty 
*dst_ptr, + const SrcIndexerT &src_indexer, + const DstIndexerT &dst_indexer) + : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const std::size_t gid = wiid.get(0); + + const ssize_t src_offset = src_indexer_(gid); + const ssize_t dst_offset = dst_indexer_(gid); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_index(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param nd Array dimensionality of the destination and source arrays + * @param packed_shapes_and_strides Kernel accessible USM array + * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of first element of src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of first element of dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes.
+ * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_strided_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 3 * nd + // [ common_shape; src_strides; dst_strides ] + + const StridedIndexer src_indexer{nd, src_offset, + packed_shapes_and_strides}; + const LeftRolled1DTransformer left_roll_transformer{shift, nelems}; + + using CompositeIndexerT = + CompositionIndexer; + + const CompositeIndexerT rolled_src_indexer(src_indexer, + left_roll_transformer); + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, + packed_shapes_and_strides, + packed_shapes_and_strides + 2 * nd}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, rolled_src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +// define function type +typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +class copy_for_roll_contig_kernel; + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_index(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative.
+ * @param nelems The number of elements to copy + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of the start of array src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of the start of array dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_contig_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + static constexpr NoOpIndexer src_indexer{}; + const LeftRolled1DTransformer roller{shift, nelems}; + + const CompositionIndexer + left_rolled_src_indexer{src_indexer, roller}; + static constexpr NoOpIndexer dst_indexer{}; + + using KernelName = copy_for_roll_contig_kernel; + + const Ty *src_tp = reinterpret_cast(src_p) + src_offset; + Ty *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor< + Ty, CompositionIndexer, + NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer, + dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + const RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, + shifts_ptr, src_offset}; + + const UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! 
+ * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + +} // namespace copy_and_cast +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp new file mode 100644 index 000000000000..b4f367448758 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -0,0 +1,655 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for copying tensors into C-contiguous layout. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_as_contig +{ + +using dpctl::tensor::ssize_t; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class CopyAsCContigFunctor +{ +private: + std::size_t nelems; + const T *src_p = nullptr; + T *dst_p = nullptr; + IndexerT src_indexer; + +public: + CopyAsCContigFunctor(std::size_t n, + const T *src_, + T *dst_, + const IndexerT &src_indexer_) + : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_max_local_range()[0]; +
const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize) + // gid % sgSize == gid - (gid / sgSize) * sgSize + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + + for (std::size_t offset = start; offset < end; offset += sgSize) { + auto src_offset = src_indexer(offset); + dst_p[offset] = src_p[src_offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + const std::uint16_t elems_per_sg = elems_per_wi * sgSize; + + if (base + elems_per_sg < nelems) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + // it == vec_id * vec_sz, for 0 <= vec_id < n_vecs + const std::size_t block_start_id = base + it * sgSize; + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[block_start_id]); + + const std::size_t elem_id0 = + block_start_id + sg.get_local_id(); + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + const std::size_t elem_id = elem_id0 + k * sgSize; + const ssize_t src_offset = src_indexer(elem_id); + dst_vec[k] = src_p[src_offset]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + const std::size_t k0 = base + lane_id; + for (std::size_t k = k0; k < nelems; k += sgSize) { + const ssize_t src_offset = src_indexer(k); + dst_p[k] = src_p[src_offset]; + } + } + } + } +}; + +template +sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, + std::size_t nelems, + const T *src, + T *dst, + const IndexerT &src_indexer, + const std::vector &depends) +{ + 
static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::size_t preferred_lws = 256; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t lws = + ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; + + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + const std::size_t nelems_per_group = nelems_per_wi * lws; + const std::size_t n_groups = + (nelems + nelems_per_group - 1) / (nelems_per_group); + + sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + + const sycl::range<1> gRange{n_groups * lws}; + const sycl::range<1> lRange{lws}; + + cgh.parallel_for( + sycl::nd_range<1>(gRange, lRange), + CopyAsCContigFunctor( + nelems, src, dst, src_indexer)); + }); + return copy_ev; +} + +template +class as_contig_krn; + +template +sycl::event + as_c_contiguous_array_generic_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); + + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + + using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; + using dpctl::tensor::kernels::alignment_utils::is_aligned; + using dpctl::tensor::kernels::alignment_utils::required_alignment; + + 
sycl::event copy_ev; + if (is_aligned(dst_p)) { + static constexpr bool enable_sg_load = true; + using KernelName = + as_contig_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + else { + static constexpr bool disable_sg_load = false; + using InnerKernelName = + as_contig_krn; + using KernelName = disabled_sg_loadstore_wrapper_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + + return copy_ev; +} + +typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + char *, + const std::vector &); + +template +struct AsCContigFactory +{ + fnT get() + { + return as_c_contiguous_array_generic_impl; + } +}; + +template +class as_contig_batch_of_square_matrices_krn; + +namespace detail +{ +/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination + strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks + to avoid race condition + */ +template +sycl::event as_c_contiguous_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + const BatchIndexerT &batch_two_offsets_indexer, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + static constexpr std::uint16_t private_tile_size = 4; + static constexpr std::uint16_t n_lines = 2; + static constexpr std::uint16_t block_size = + n_lines * private_tile_size * private_tile_size; + + static constexpr std::uint16_t lws0 = block_size; + static constexpr std::uint16_t lws1 = n_lines; + static constexpr std::uint16_t nelems_per_wi = (block_size / lws1); + + static_assert(nelems_per_wi * lws1 == block_size); + static_assert(nelems_per_wi == private_tile_size * 
private_tile_size); + + static constexpr std::uint32_t lws = lws0 * lws1; + + const std::size_t n_tiles = (n + block_size - 1) / block_size; + + const ssize_t src_stride = src_ld; + const ssize_t dst_stride = dst_ld; + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + using KernelName = + as_contig_batch_of_square_matrices_krn; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::local_accessor local_block(block_size * block_size, cgh); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { + // 1. Read block from source array into SLM + const std::uint32_t lid_lin = nd_it.get_local_linear_id(); + const std::size_t gr_id_lin = nd_it.get_group_linear_id(); + + const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); + const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); + + const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id); + const auto &src_batch_offset = batch_two_offsets.get_first_offset(); + const auto &dst_batch_offset = + batch_two_offsets.get_second_offset(); + + // Block id + /* 0 <= src_tile_i1 < n_tiles */ + const std::size_t src_tile_i1 = rem / n_tiles; + /* 0 <= src_tile_i0 < n_tiles */ + const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles; + + // ID of element within the block + /* 0 <= src_i1 < lws1 */ + const std::uint32_t src_i1 = lid_lin / lws0; + /* 0 <= src_i0 < lws0 */ + const std::uint32_t src_i0 = lid_lin - src_i1 * lws0; + + // Matrix element ID + const std::size_t src_tile_start0 = src_tile_i0 * block_size; + const std::size_t src_tile_start1 = src_tile_i1 * block_size; + const std::size_t src_gid0 = (src_tile_start0 + src_i0); + const std::size_t src_gid1 = (src_tile_start1 + src_i1); + + // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) * + // src_stride + const std::size_t src_offset0 = + src_batch_offset + src_gid0 * 1 + src_gid1 *
src_stride; + const std::size_t pr_step_src = lws1 * src_stride; + + const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size; + const std::uint32_t pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + local_block[local_offset0 + pr_step_local * pr_id] = + (src_gid0 < n && src_gid1 + pr_id * lws1 < n) + ? src_tp[src_offset0 + pr_step_src * pr_id] + : T(0); + } + + const std::uint32_t local_dim0 = static_cast( + std::min(src_tile_start0 + block_size, n) - + src_tile_start0); + const std::uint32_t local_dim1 = static_cast( + std::min(src_tile_start1 + block_size, n) - + src_tile_start1); + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 2. Permute the block matrix in SLM using two private arrays + std::array private_block_01 = {T(0)}; + std::array private_block_10 = {T(0)}; + + // 0 <= lid_lin < lws0 * lws1 == + // (block_size * block_size / nelems_per_wi) == + // (block_size/private_tile_size)**2 + static constexpr std::uint16_t n_private_tiles_per_axis = + block_size / private_tile_size; + const std::uint16_t local_tile_id0 = + lid_lin / n_private_tiles_per_axis; + const std::uint16_t local_tile_id1 = + lid_lin - local_tile_id0 * n_private_tiles_per_axis; + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + + const std::uint16_t pr_offset = + pr_i1 * private_tile_size + pr_i0; + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // read (local_tile_id0, local_tile_id1) + const std::uint16_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + private_block_01[pr_offset] = + local_block[local_01_offset]; + + // read (local_tile_id1, local_tile_id0) + const 
std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + private_block_10[pr_offset] = + local_block[local_10_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + const std::uint16_t pr_offset = + pr_i0 * private_tile_size + pr_i1; + + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // write back permuted private blocks + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + local_block[local_01_offset] = + private_block_10[pr_offset]; + + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + local_block[local_10_offset] = + private_block_01[pr_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 3. 
Write out permuted SLM to destination array + + const std::size_t dst_tile_start0 = src_tile_start0; + const std::size_t dst_tile_start1 = src_tile_start1; + + if (local_dim0 == block_size && local_dim1 == block_size) { + const std::uint16_t dst_i0 = src_i1; + const std::uint16_t dst_i1 = src_i0; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset0 = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::size_t pr_step_dst = lws1 * dst_stride; + + const std::uint16_t _local_offset0 = + dst_i0 * block_size + dst_i1; + const std::uint16_t _pr_step_local = lws1 * block_size; + + for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) { + dst_tp[dst_offset0 + pr_step_dst * pr_id] = + local_block[_local_offset0 + + _pr_step_local * pr_id]; + } + } + } + else { + // map local_linear_id into (local_dim0, local_dim1) + for (std::uint16_t el_id = lid_lin; + el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) + { + + // 0 <= loc_i0 < local_dim0 + const std::uint16_t loc_i0 = el_id / local_dim1; + // 0 <= loc_i1 < local_dim1 + const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1; + + const std::uint16_t dst_i0 = loc_i0; + const std::uint16_t dst_i1 = loc_i1; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::uint16_t local_offset = + loc_i0 * block_size + loc_i1; + + if ((dst_gid1 < n) && (dst_gid0 < n)) { + dst_tp[dst_offset] = local_block[local_offset]; + } + } + } + }); + }); + + return e; +} + +} // end of namespace detail + +template +sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + ssize_t src_batch_step, + ssize_t dst_batch_step, +
std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = + TwoOffsets_CombinedIndexer; + + const auto &src_batch_indexer = + Strided1DIndexer(batch_nelems, src_batch_step); + const auto &dst_batch_indexer = + Strided1DIndexer(batch_nelems, dst_batch_step); + + const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p, + dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of batch elements */ + ssize_t, /* distance between batches in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* size of square matrices in the batch */ + const char *, + ssize_t, /* untyped pointer to F-contig source array, and matrix leading + dimension */ + char *, + ssize_t, /* untyped pointer to C-contig destination array, and matrix + leading dimension */ + const std::vector &); + +template +struct AsCContig1DBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_1d_batch_of_square_matrices_impl; + } +}; + +template +sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + int batch_nd, + const ssize_t *src_batch_shape_strides, + const ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = 
TwoOffsets_CombinedIndexer; + + static constexpr ssize_t zero_offset{0}; + + const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, + src_batch_shape_strides}; + const DstIndexerT dst_batch_indexer{/* size */ batch_nelems, + /* step */ dst_batch_step}; + + const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer, + dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld, + dst_p, dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of matrices in the batch */ + int, + const ssize_t *, /* dimensionality, and packed [shape, src_strides] + describing iteration over batch in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* matrix size */ + const char *, + ssize_t, /* untyped pointer to source array of F-contig matrices, and + leading dimension of the matrix */ + char *, + ssize_t, /* untyped pointer to destination array of C-contig matrices, and + leading dimension of the matrix */ + const std::vector &); + +template +struct AsCContigNDBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_nd_batch_of_square_matrices_impl; + } +}; + +} // namespace copy_as_contig +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp new file mode 100644 index 000000000000..53b39ff5874c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -0,0 +1,758 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/copy_as_contiguous.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_array_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +static as_c_contiguous_array_impl_fn_ptr_t + as_c_contig_array_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +void init_copy_as_contig_dispatch_vectors(void) +{ + + using dpctl::tensor::kernels::copy_as_contig:: + AsCContig1DBatchOfSquareMatricesFactory; + using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory; + using dpctl::tensor::kernels::copy_as_contig:: + AsCContigNDBatchOfSquareMatricesFactory; + using td_ns::DispatchVectorBuilder; + + // Generic to c-contig + 
DispatchVectorBuilder + dtv_as_c_contig_array; + + dtv_as_c_contig_array.populate_dispatch_vector( + as_c_contig_array_dispatch_vector); + + // 1D batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t, + AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_1d_batch_of_square_matrices; + + dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_1d_batch_of_square_matrices_dispatch_vector); + + // ND batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t, + AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_nd_batch_of_square_matrices; + + dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_nd_batch_of_square_matrices_dispatch_vector); +} + +namespace +{ + +template +std::size_t get_nelems(const std::vector &shape) +{ + auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t { + return prod * static_cast(term); + }; + + static constexpr std::size_t unit{1}; + + const std::size_t nelems = + std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn); + return nelems; +} + +} // end of anonymous namespace + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
+ */ + const int src_nd = src.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.back(); + if (n == dst_shape_vec[src_nd - 2]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[src_nd - 2] == unit_stride) { + return py_as_c_contig_f2c(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd 
= src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be F-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.front(); + if (n == dst_shape_vec[1]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[1] == unit_stride) { + return py_as_f_contig_c2f(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + + using shT = std::vector; + shT simplified_shape; + shT 
simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = src_shape_vec.back(); + if (src_shape_vec[src_nd - 2] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), 
sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? py::ssize_t(0) : dst_strides_vec[src_nd - 3]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec), + std::end(src_shape_vec) - 2); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec), + std::end(src_strides_vec) - 2); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec), + std::end(dst_strides_vec) - 2); + } + + // simplify batch iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + 
[src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], all_depends); + + // async free of shape_strides temporary + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = dst_shape_vec.front(); + if (dst_shape_vec[1] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[1] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[2]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec) + 2, + std::end(src_shape_vec)); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec) + 2, + std::end(src_strides_vec)); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec) + 2, + std::end(dst_strides_vec)); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + 
impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], all_depends); + + // async free of shape_strides + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp new file mode 100644 index 000000000000..2de67098b7fa --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -0,0 +1,61 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +void init_copy_as_contig_dispatch_vectors(void); + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl From 5a9c14cd5ac07cf0a79da70e67b1cd9c28f063c6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:16:36 -0800 Subject: [PATCH 05/50] Add copy_usm_ndarray_into_usm_ndarray implementation --- .../source/copy_and_cast_usm_to_usm.cpp | 310 ++++++++++++++++++ .../source/copy_and_cast_usm_to_usm.hpp | 60 ++++ 2 files changed, 370 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp new file mode 100644 index 000000000000..0458aa75ac32 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -0,0 +1,310 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; + +static copy_and_cast_generic_fn_ptr_t + copy_and_cast_generic_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_1d_fn_ptr_t + copy_and_cast_1d_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_contig_fn_ptr_t + copy_and_cast_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + // shapes must be 
the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && (i < src_nd); ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // check for applicability of special cases: + // (both C-contiguous || both F-contiguous) + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + if (both_c_contig || both_f_contig) { + + sycl::event copy_ev; + if (src_type_id == dst_type_id) { + + int src_elem_size = src.get_elemsize(); + + copy_ev = exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + } + else { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id]; + copy_ev = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + // make sure src and dst are not GC-ed before copy_ev is complete + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + if ((src_type_id == dst_type_id) && (src_nd > 1)) { + if (is_dst_c_contig) { + return py_as_c_contig(src, dst, exec_q, depends); + } + else if (is_dst_f_contig) { + return py_as_f_contig(src, dst, exec_q, depends); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, 
simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd < 2) { + if (nd == 1) { + std::array shape_arr = {simplified_shape[0]}; + std::array src_strides_arr = { + simplified_src_strides[0]}; + std::array dst_strides_arr = { + simplified_dst_strides[0]}; + + sycl::event copy_and_cast_1d_event; + if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) && + (src_offset == 0) && (dst_offset == 0)) + { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id] + [src_type_id]; + copy_and_cast_1d_event = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + else { + auto fn = + copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + copy_and_cast_1d_event = + fn(exec_q, src_nelems, shape_arr, src_strides_arr, + dst_strides_arr, src_data, src_offset, dst_data, + dst_offset, depends); + } + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), + copy_and_cast_1d_event); + } + else if (nd == 0) { // case of a scalar + assert(src_nelems == 1); + std::array shape_arr = {1}; + std::array src_strides_arr = {1}; + std::array dst_strides_arr = {1}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_0d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), + copy_and_cast_0d_event); + } + } + + // Generic implementation + auto copy_and_cast_fn = + copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + 
const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &copy_and_cast_generic_ev = copy_and_cast_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_and_cast_generic_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_and_cast_generic_ev); +} + +void init_copy_and_cast_usm_to_usm_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory; + DispatchTableBuilder + dtb_contig; + dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory; + DispatchTableBuilder + dtb_generic; + dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory; + DispatchTableBuilder + dtb_1d; + dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp new file mode 100644 index 000000000000..d2a2dcaf7b85 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4f6334054fc08df7c2c2f7657bc5f4569ee4363a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:18:36 -0800 Subject: [PATCH 06/50] Add pybind11 bindings for dpctl_ext.tensor._tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 502 ++++++++++++++++++ 1 file changed, 502 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp new file mode 100644 index 000000000000..b41b5c9ce423 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -0,0 +1,502 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" + +// #include "accumulators.hpp" +// #include "boolean_advanced_indexing.hpp" +// #include "clip.hpp" +#include "copy_and_cast_usm_to_usm.hpp" +#include "copy_as_contig.hpp" +// #include "copy_for_reshape.hpp" +// #include "copy_for_roll.hpp" +// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +// #include "device_support_queries.hpp" +// #include "eye_ctor.hpp" +// #include "full_ctor.hpp" +// #include "integer_advanced_indexing.hpp" +#include "kernels/dpctl_tensor_types.hpp" +// #include "linear_sequences.hpp" +// #include "repeat.hpp" +#include "simplify_iteration_space.hpp" +// #include "triul_ctor.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/strided_iters.hpp" +// #include "where.hpp" +// #include "zeros_ctor.hpp" + +namespace py = pybind11; + +static_assert(std::is_same_v); + +namespace +{ + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +using dpctl::tensor::overlap::MemoryOverlap; +using dpctl::tensor::overlap::SameLogicalTensors; + +using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::py_as_c_contig; +using dpctl::tensor::py_internal::py_as_f_contig; + +/* =========================== Copy for reshape ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; + +/* =========================== Copy for roll ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +// using 
dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; + +/* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ + +// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; + +/* ============= linear-sequence ==================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; + +/* ================ Full ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_full; + +/* ================ Zeros ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_zeros; + +/* ============== Advanced Indexing ============= */ +// using dpctl::tensor::py_internal::usm_ndarray_put; +// using dpctl::tensor::py_internal::usm_ndarray_take; + +// using dpctl::tensor::py_internal::py_extract; +// using dpctl::tensor::py_internal::py_mask_positions; +// using dpctl::tensor::py_internal::py_nonzero; +// using dpctl::tensor::py_internal::py_place; + +/* ================= Repeat ====================*/ +// using dpctl::tensor::py_internal::py_cumsum_1d; +// using dpctl::tensor::py_internal::py_repeat_by_scalar; +// using dpctl::tensor::py_internal::py_repeat_by_sequence; + +/* ================ Eye ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_eye; + +/* =========================== Tril and triu ============================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_triul; + +/* =========================== Where ============================== */ + +// using dpctl::tensor::py_internal::py_where; + +/* =========================== Clip ============================== */ +// using dpctl::tensor::py_internal::py_clip; + +// populate dispatch tables +void init_dispatch_tables(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_and_cast_usm_to_usm_dispatch_tables(); + // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + // 
init_advanced_indexing_dispatch_tables(); + // init_where_dispatch_tables(); + return; +} + +// populate dispatch vectors +void init_dispatch_vectors(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_as_contig_dispatch_vectors(); + // init_copy_for_reshape_dispatch_vectors(); + // init_copy_for_roll_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); + // init_full_ctor_dispatch_vectors(); + // init_zeros_ctor_dispatch_vectors(); + // init_eye_ctor_dispatch_vectors(); + // init_triul_ctor_dispatch_vectors(); + + // populate_masked_extract_dispatch_vectors(); + // populate_masked_place_dispatch_vectors(); + + // populate_mask_positions_dispatch_vectors(); + + // populate_cumsum_1d_dispatch_vectors(); + // init_repeat_dispatch_vectors(); + + // init_clip_dispatch_vectors(); + + return; +} + +} // namespace + +PYBIND11_MODULE(_tensor_impl, m) +{ + init_dispatch_tables(); + init_dispatch_vectors(); + + using dpctl::tensor::strides::contract_iter; + m.def( + "_contract_iter", &contract_iter, + "Simplifies iteration of array of given shape & stride. Returns " + "a triple: shape, stride and offset for the new iterator of possible " + "smaller dimension, which traverses the same elements as the original " + "iterator, possibly in a different order."); + + m.def("_copy_usm_ndarray_into_usm_ndarray", + &copy_usm_ndarray_into_usm_ndarray, + "Copies from usm_ndarray `src` into usm_ndarray `dst` of the same " + "shape. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_c_contig", &py_as_c_contig, + "Copies from usm_ndarray `src` into C-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. 
" + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_f_contig", &py_as_f_contig, + "Copies from usm_ndarray `src` into F-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + using dpctl::tensor::strides::contract_iter2; + m.def( + "_contract_iter2", &contract_iter2, + "Simplifies iteration over elements of pair of arrays of given shape " + "with strides stride1 and stride2. Returns " + "a 5-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter3; + m.def( + "_contract_iter3", &contract_iter3, + "Simplifies iteration over elements of 3-tuple of arrays of given " + "shape " + "with strides stride1, stride2, and stride3. Returns " + "a 7-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter4; + m.def( + "_contract_iter4", &contract_iter4, + "Simplifies iteration over elements of 4-tuple of arrays of given " + "shape " + "with strides stride1, stride2, stride3, and stride4. 
Returns " + "a 9-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + static constexpr char orderC = 'C'; + m.def( + "_ravel_multi_index", + [](const std::vector &mi, + const std::vector &shape, char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_ravel_multi_index_c(mi, + shape); + } + else { + return dpctl::tensor::py_internal::_ravel_multi_index_f(mi, + shape); + } + }, + ""); + + m.def( + "_unravel_index", + [](py::ssize_t flat_index, const std::vector &shape, + char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_unravel_index_c(flat_index, + shape); + } + else { + return dpctl::tensor::py_internal::_unravel_index_f(flat_index, + shape); + } + }, + ""); + + // m.def("_copy_usm_ndarray_for_reshape", &copy_usm_ndarray_for_reshape, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "number of elements using underlying 'C'-contiguous order for + // flat " "traversal. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for flat " + // "traversal with shift. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("src"), py::arg("dst"), py::arg("shift"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_nd", &copy_usm_ndarray_for_roll_nd, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for " "traversal + // with shifts along each axis. 
" "Returns a tuple of events: + // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and step `dt`. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and end point `end`. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_copy_numpy_ndarray_into_usm_ndarray", + // &copy_numpy_ndarray_into_usm_ndarray, + // "Copy from numpy array `src` into usm_ndarray `dst` + // synchronously.", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_full_usm_ndarray", &usm_ndarray_full, + // "Populate usm_ndarray `dst` with given fill_value.", + // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_take", &usm_ndarray_take, + // "Takes elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` from array `src` and copies them " + // "into usm_ndarray `dst` synchronously." 
+ // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("ind"), py::arg("dst"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_put", &usm_ndarray_put, + // "Puts elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` into array `dst` from " + // "usm_ndarray `val` synchronously." + // "Returns a tuple of events: (hev, ev)", + // py::arg("dst"), py::arg("ind"), py::arg("val"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_eye", &usm_ndarray_eye, + // "Fills input 2D contiguous usm_ndarray `dst` with " + // "zeros outside of the diagonal " + // "specified by " + // "the diagonal index `k` " + // "which is filled with ones." + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("default_device_fp_type", + // dpctl::tensor::py_internal::default_device_fp_type, + // "Gives default floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_int_type", + // dpctl::tensor::py_internal::default_device_int_type, + // "Gives default signed integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_uint_type", + // dpctl::tensor::py_internal::default_device_uint_type, + // "Gives default unsigned integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_bool_type", + // dpctl::tensor::py_internal::default_device_bool_type, + // "Gives default boolean type supported by device.", py::arg("dev")); + + // m.def("default_device_complex_type", + // dpctl::tensor::py_internal::default_device_complex_type, + // "Gives default complex floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_index_type", + // 
dpctl::tensor::py_internal::default_device_index_type, + // "Gives default index type supported by device.", py::arg("dev")); + + // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + // }; + // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + // }; + // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + // py::arg("cumsum"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto overlap = [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &overlap = MemoryOverlap(); + return overlap(x1, x2); + }; + m.def("_array_overlap", overlap, + "Determines if the memory regions indexed by each array overlap", + py::arg("array1"), py::arg("array2")); + + // auto same_logical_tensors = + // [](const dpctl::tensor::usm_ndarray &x1, + // const dpctl::tensor::usm_ndarray &x2) -> bool { + // auto const 
&same_logical_tensors = SameLogicalTensors(); + // return same_logical_tensors(x1, x2); + // }; + // m.def("_same_logical_tensors", same_logical_tensors, + // "Determines if the memory regions indexed by each array are the + // same", py::arg("array1"), py::arg("array2")); + + // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + // py::arg("mask_shape"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), + // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const dpctl::tensor::usm_ndarray &reps, + // const dpctl::tensor::usm_ndarray &cumsum, + // std::optional axis, sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_sequence(src, dst, reps, cumsum, + // axis.value(), + // exec_q, depends); + // } + // else { + // return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, + // depends); + // } + // }; + // m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), + // py::arg("dst"), py::arg("reps"), py::arg("cumsum"), + // py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const py::ssize_t reps, std::optional axis, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, + // depends); + // } + // else { + // return py_repeat_by_scalar(src, dst, reps, exec_q, depends); + // } 
+ // }; + // m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"), + // py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_clip", &py_clip, + // "Clamps elements of array `x` to the range " + // "[`min`, `max] and writes the result to the " + // "array `dst` for each element of `x`, `min`, and `max`." + // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); +} From 634579c5f0d64d44805d0a020cb4ca5ae1d5e774 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:24:11 -0800 Subject: [PATCH 07/50] Add CMake build files for dpctl_ext --- dpctl_ext/CMakeLists.txt | 205 ++++++++++++++++++++++++++++++++ dpctl_ext/tensor/CMakeLists.txt | 175 +++++++++++++++++++++++++++ 2 files changed, 380 insertions(+) create mode 100644 dpctl_ext/CMakeLists.txt create mode 100644 dpctl_ext/tensor/CMakeLists.txt diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt new file mode 100644 index 000000000000..bb33a4f57332 --- /dev/null +++ b/dpctl_ext/CMakeLists.txt @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +find_package(Python REQUIRED COMPONENTS NumPy) + +# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) +# -w is to set working directory (and correctly set __pyx_f[] array of filenames) +set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") +find_package(Cython REQUIRED) + +if(WIN32) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + ) + string(CONCAT SDL_FLAGS "/GS " "/DynamicBase ") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} 
${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_C_FLAGS_COVERAGE + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_CXX_FLAGS_COVERAGE + "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "/NXCompat;/DynamicBase") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +elseif(UNIX) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + "-fdiagnostics-color=auto " + ) + string( + CONCAT SDL_FLAGS + "-fstack-protector " + "-fstack-protector-all " + "-fpic " + "-fPIC " + "-D_FORTIFY_SOURCE=2 " + "-Wformat " + "-Wformat-security " + # "-fno-strict-overflow " # no-strict-overflow is implied by -fwrapv + "-fno-delete-null-pointer-checks " + "-fwrapv " + ) + string(CONCAT CFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_C_FLAGS_COVERAGE "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "-z,noexecstack,-z,relro,-z,now") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +else() + message(FATAL_ERROR 
"Unsupported system.") +endif() + +# at build time create include/ directory and copy header files over +set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) + +set(CMAKE_INSTALL_RPATH "$ORIGIN") + +function(build_dpctl_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) + add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) + set(_cythonize_trgt "${_trgt}_cythonize_pyx") + python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) + if(BUILD_DPCTL_EXT_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) + if(DPCTL_OFFLOAD_COMPRESS) + target_link_options(${_trgt} PRIVATE --offload-compress) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${_trgt} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE Python::NumPy) + if(DPCTL_GENERATE_COVERAGE) + target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) + if(BUILD_DPCTL_EXT_SYCL) + target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") + target_link_options(${_trgt} PRIVATE ${_linker_options}) + get_filename_component(_name_wle ${_generated_src} NAME_WLE) + get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) + set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") + set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") + + # TODO: create separate folder inside build folder that contains only + # headers related to this target and appropriate folder structure to + # eliminate shadow dependencies 
+ get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) + # TODO: do not set directory if we did not generate header + target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) + set(_rpath_value "$ORIGIN") + if(BUILD_DPCTL_EXT_RELATIVE_PATH) + set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") + endif() + if(DPCTL_WITH_REDIST) + set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") + endif() + set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) + + install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) + install( + FILES ${_generated_api_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + install( + FILES ${_generated_public_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + if(DPCTL_GENERATE_COVERAGE) + get_filename_component(_original_src_dir ${_src} DIRECTORY) + file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) + install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) + endif() + + # Create target with headers only, because python is managing all the + # library imports at runtime + set(_trgt_headers ${_trgt}_headers) + add_library(${_trgt_headers} INTERFACE) + add_dependencies(${_trgt_headers} ${_trgt}) + get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) + target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) +endfunction() + +add_subdirectory(tensor) diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt new file mode 100644 index 000000000000..ed8294b76615 --- /dev/null +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, 
Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +if(WIN32) + if(${CMAKE_VERSION} VERSION_LESS "3.23") + # this is a work-around for target_link_options inserting the option after the -link option, causing + # the linker to ignore it.
+ set(CMAKE_CXX_LINK_FLAGS + "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel" + ) + endif() +endif() + +set(_static_lib_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp +) +set(_tensor_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp +) + +set(_static_lib_trgt simplify_iteration_space) + +add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) +target_include_directories( + ${_static_lib_trgt} + PRIVATE + ${Python_INCLUDE_DIRS} + ${DPCTL_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include +) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +set(_py_trgts) + +set(python_module_name _tensor_impl) 
+pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(_clang_prefix "") +if(WIN32) + set(_clang_prefix "/clang:") +endif() + +set(_no_fast_math_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp +) +list( + APPEND _no_fast_math_sources + # ${_elementwise_sources} + # ${_reduction_sources} + # ${_sorting_sources} + # ${_linalg_sources} + # ${_accumulator_sources} +) + +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() + +set(_compiler_definitions "") + +set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +foreach(python_module_name ${_py_trgts}) + target_compile_options( + ${python_module_name} + PRIVATE -fno-sycl-id-queries-fit-in-int + ) + target_link_options( + ${python_module_name} + PRIVATE -fsycl-device-code-split=per_kernel + ) + if(DPCTL_OFFLOAD_COMPRESS) + target_link_options(${python_module_name} PRIVATE --offload-compress) + endif() + + target_include_directories( + ${python_module_name} + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${Dpctl_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + if(DPCTL_GENERATE_COVERAGE) + 
if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS) + target_compile_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + target_link_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_link_options} + ) + endif() + # TODO: update source so they reference individual libraries instead of + # dpctl4pybind11.hpp. It will allow to simplify dependency tree + # NOTE: dpctl C-API is resolved at runtime via Python + # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) + if(DPCTL_WITH_REDIST) + set_target_properties( + ${python_module_name} + PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." + ) + endif() + # TODO: revert to `DESTINATION "dpctl/tensor"` + install(TARGETS ${python_module_name} DESTINATION "dpctl_ext/tensor") +endforeach() From 79d40f235d10d1b9d514d9db07939d0bb447086c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:31:12 -0800 Subject: [PATCH 08/50] Add empty __init__ to dpctl_ext/ --- dpctl_ext/__init__.py | 27 +++++++++++++++++++++++++++ dpctl_ext/tensor/__init__.py | 27 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 dpctl_ext/__init__.py create mode 100644 dpctl_ext/tensor/__init__.py diff --git a/dpctl_ext/__init__.py b/dpctl_ext/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/tensor/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** From 7949c17c3586a4ad0222c6abbf3a616202834c68 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:53:03 -0800 Subject: [PATCH 09/50] Enable _same_logical_tensors in _tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b41b5c9ce423..ca3b7bd49116 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -430,15 +430,15 @@ PYBIND11_MODULE(_tensor_impl, m) "Determines if the memory regions indexed by each array overlap", py::arg("array1"), py::arg("array2")); - // auto same_logical_tensors = - // [](const dpctl::tensor::usm_ndarray &x1, - // const dpctl::tensor::usm_ndarray &x2) -> bool { - // auto const &same_logical_tensors = SameLogicalTensors(); - // return same_logical_tensors(x1, x2); - // }; - // m.def("_same_logical_tensors", same_logical_tensors, - // "Determines if the memory regions indexed by each array are the - // same", py::arg("array1"), py::arg("array2")); + auto same_logical_tensors = + [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &same_logical_tensors = SameLogicalTensors(); + return same_logical_tensors(x1, x2); + }; + m.def("_same_logical_tensors", same_logical_tensors, + "Determines if the memory regions indexed by each array are the same", + py::arg("array1"), py::arg("array2")); // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), From 29d6c029190714cab8a460c02f32130c7ea59cc6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:14:28 -0800 Subject: [PATCH 10/50] Add device_support_queries to enable default 
device types --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../source/device_support_queries.cpp | 184 ++++++++++++++++++ .../source/device_support_queries.hpp | 58 ++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 56 +++--- 4 files changed, 271 insertions(+), 29 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ed8294b76615..ee8da2e49506 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -56,7 +56,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp new file mode 100644 index 000000000000..51eb7dba1b6c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include <string> + +#include "dpnp4pybind11.hpp" +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace +{ + +std::string _default_device_fp_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "f8"; + } + else { + return "f4"; + } +} + +int get_numpy_major_version() +{ + namespace py = pybind11; + + py::module_ numpy = py::module_::import("numpy"); + py::str version_string = numpy.attr("__version__"); + py::module_ numpy_lib = py::module_::import("numpy.lib"); + + py::object numpy_version = numpy_lib.attr("NumpyVersion")(version_string); + int major_version = numpy_version.attr("major").cast<int>(); + + return major_version; +} + +std::string _default_device_int_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "i8"; + } + else { + // code for numpy.dtype('long') to be consistent + // with NumPy's default integer type across + // platforms. + return "l"; + } +} + +std::string _default_device_uint_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "u8"; + } + else { + // code for numpy.dtype('ulong') to be consistent + // with NumPy's default integer type across + // platforms.
+ return "L"; + } +} + +std::string _default_device_complex_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "c16"; + } + else { + return "c8"; + } +} + +std::string _default_device_bool_type(const sycl::device &) +{ + return "b1"; +} + +std::string _default_device_index_type(const sycl::device &) +{ + return "i8"; +} + +sycl::device _extract_device(const py::object &arg) +{ + auto const &api = dpctl::detail::dpctl_capi::get(); + + PyObject *source = arg.ptr(); + if (api.PySyclQueue_Check_(source)) { + const sycl::queue &q = py::cast<sycl::queue>(arg); + return q.get_device(); + } + else if (api.PySyclDevice_Check_(source)) { + return py::cast<sycl::device>(arg); + } + else { + throw py::type_error( + "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`."); + } +} + +} // namespace + +std::string default_device_fp_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_fp_type(d); +} + +std::string default_device_int_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_int_type(d); +} + +std::string default_device_uint_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_uint_type(d); +} + +std::string default_device_bool_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_bool_type(d); +} + +std::string default_device_complex_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_complex_type(d); +} + +std::string default_device_index_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_index_type(d); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp new file mode 100644 index 000000000000..6ea01dcd49d7
--- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -0,0 +1,58 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file declares functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include <string> + +#include "dpnp4pybind11.hpp" +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <sycl/sycl.hpp> + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::string default_device_fp_type(const py::object &); +extern std::string default_device_int_type(const py::object &); +extern std::string default_device_uint_type(const py::object &); +extern std::string default_device_bool_type(const py::object &); +extern std::string default_device_complex_type(const py::object &); +extern std::string default_device_index_type(const py::object &); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index ca3b7bd49116..911d75ebd925 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ // #include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" -// #include "device_support_queries.hpp" +#include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include "full_ctor.hpp" // #include "integer_advanced_indexing.hpp" @@ -360,33 +360,33 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("default_device_fp_type", - // dpctl::tensor::py_internal::default_device_fp_type, - // "Gives default floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_int_type", - // dpctl::tensor::py_internal::default_device_int_type, - // "Gives default
signed integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_uint_type", - // dpctl::tensor::py_internal::default_device_uint_type, - // "Gives default unsigned integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_bool_type", - // dpctl::tensor::py_internal::default_device_bool_type, - // "Gives default boolean type supported by device.", py::arg("dev")); - - // m.def("default_device_complex_type", - // dpctl::tensor::py_internal::default_device_complex_type, - // "Gives default complex floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_index_type", - // dpctl::tensor::py_internal::default_device_index_type, - // "Gives default index type supported by device.", py::arg("dev")); + m.def("default_device_fp_type", + dpctl::tensor::py_internal::default_device_fp_type, + "Gives default floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_int_type", + dpctl::tensor::py_internal::default_device_int_type, + "Gives default signed integer type supported by device.", + py::arg("dev")); + + m.def("default_device_uint_type", + dpctl::tensor::py_internal::default_device_uint_type, + "Gives default unsigned integer type supported by device.", + py::arg("dev")); + + m.def("default_device_bool_type", + dpctl::tensor::py_internal::default_device_bool_type, + "Gives default boolean type supported by device.", py::arg("dev")); + + m.def("default_device_complex_type", + dpctl::tensor::py_internal::default_device_complex_type, + "Gives default complex floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_index_type", + dpctl::tensor::py_internal::default_device_index_type, + "Gives default index type supported by device.", py::arg("dev")); // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, From 936e7198e2014330b34c5918a63230ea699e063e Mon Sep 17 
00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:52:17 -0800 Subject: [PATCH 11/50] Enable building and packaging of dpctl_ext --- CMakeLists.txt | 1 + setup.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 386b17b44294..d2ee5e84c0c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -336,3 +336,4 @@ if(DEFINED SKBUILD) endif() add_subdirectory(dpnp) +add_subdirectory(dpctl_ext) diff --git a/setup.py b/setup.py index cc21221299c4..a0c54b066dcf 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,9 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", + # dpctl_ext + "dpctl_ext", + "dpctl_ext.tensor", ], package_data={ "dpnp": [ From cd85f1e333bcad154272946f71c127b9ea9a916b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 06:14:39 -0800 Subject: [PATCH 12/50] Use _tensor_impl from dpctl_ext.tensor in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 +- dpnp/dpnp_iface.py | 2 +- dpnp/dpnp_iface_searching.py | 2 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 +- dpnp/scipy/linalg/_utils.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 57bf50422fa0..b63bf61f8dad 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -31,7 +31,6 @@ import dpctl.tensor as dpt import dpctl.tensor._copy_utils as dtc -import dpctl.tensor._tensor_impl as dti import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy @@ -45,6 +44,7 @@ _validate_dtype, ) +import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index fba1a215756a..832446c826ba 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -45,11 +45,11 @@ import dpctl import dpctl.tensor as dpt -import 
dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._device import normalize_queue_device +import dpctl_ext.tensor._tensor_impl as ti import dpnp from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 6eefe010b699..fdbd317d31dd 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -40,8 +40,8 @@ """ import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as dti +import dpctl_ext.tensor._tensor_impl as dti import dpnp from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 30be5d1ff5cb..4d8e3cdfbd0d 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -28,7 +28,6 @@ import dpctl import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -38,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 282c645d1095..8eb9187236bf 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -42,9 +42,9 @@ from warnings import warn -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 0c6780a8f8b45e87263fbf316bc17aac5ed91dc1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:56:50 -0800 Subject: [PATCH 13/50] Move put() and take() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 11 + dpctl_ext/tensor/_indexing_functions.py | 329 
+++++++ dpctl_ext/tensor/_numpy_helper.py | 45 + .../kernels/integer_advanced_indexing.hpp | 427 +++++++++ .../source/integer_advanced_indexing.cpp | 819 ++++++++++++++++++ .../source/integer_advanced_indexing.hpp | 73 ++ .../tensor/libtensor/source/tensor_ctors.cpp | 42 +- 8 files changed, 1726 insertions(+), 22 deletions(-) create mode 100644 dpctl_ext/tensor/_indexing_functions.py create mode 100644 dpctl_ext/tensor/_numpy_helper.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..ae8b72d71873 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -49,7 +49,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a71324cb88d8..35453dbf9a46 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -25,3 +25,14 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** + + +from dpctl_ext.tensor._indexing_functions import ( + put, + take, +) + +__all__ = [ + "put", + "take", +] diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py new file mode 100644 index 000000000000..106df09cf97e --- /dev/null +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -0,0 +1,329 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils + +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index + + +def _get_indexing_mode(name): + modes = {"wrap": 0, "clip": 1} + try: + return modes[name] + except KeyError: + raise ValueError( + "`mode` must be `wrap` or `clip`. Got `{}`.".format(name) + ) + + +def put(x, indices, vals, /, *, axis=None, mode="wrap"): + """put(x, indices, vals, axis=None, mode="wrap") + + Puts values into an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array the values will be put into. + indices (usm_ndarray): + One-dimensional array of indices. + vals (usm_ndarray): + Array of values to be put into ``x``. + Must be broadcastable to the result shape + ``x.shape[:axis] + indices.shape + x.shape[axis+1:]``. + axis (int, optional): + The axis along which the values will be placed. + If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + .. 
note:: + + If input array ``indices`` contains duplicates, a race condition + occurs, and the value written into corresponding positions in ``x`` + may vary from run to run. Preserving sequential semantics in handling + the duplicates to achieve deterministic behavior requires additional + work, e.g. + + :Example: + + .. code-block:: python + + from dpctl import tensor as dpt + + def put_vec_duplicates(vec, ind, vals): + "Put values into vec, handling possible duplicates in ind" + assert (vec.ndim, ind.ndim, vals.ndim) == (1, 1, 1) + + # find positions of last occurrences of each + # unique index + ind_flipped = dpt.flip(ind) + ind_uniq = dpt.unique_all(ind_flipped).indices + has_dups = len(ind) != len(ind_uniq) + + if has_dups: + ind_uniq = dpt.subtract(ind.size - 1, ind_uniq) + ind = dpt.take(ind, ind_uniq) + vals = dpt.take(vals, ind_uniq) + + dpt.put(vec, ind, vals) + + n = 512 + ind = dpt.concat((dpt.arange(n), dpt.arange(n, -1, step=-1))) + x = dpt.zeros(ind.size, dtype="int32") + vals = dpt.arange(ind.size, dtype=x.dtype) + + # Values corresponding to last positions of + # duplicate indices are written into the vector x + put_vec_duplicates(x, ind, vals) + + parts = (vals[-1:-n-2:-1], dpt.zeros(n, dtype=x.dtype)) + expected = dpt.concat(parts) + assert dpt.all(x == expected) + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if isinstance(vals, dpt.usm_ndarray): + queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type] + else: + queues_ = [x.sycl_queue, indices.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type] + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + if indices.dtype.kind not in 
"ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot put non-empty indices into an empty axis") + val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + val_shape = indices.shape + + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q + ) + # choose to throw here for consistency with `place` + if vals.size == 0: + raise ValueError( + "cannot put into non-empty indices along an empty axis" + ) + if vals.dtype == x.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, x.dtype) + rhs = dpt.broadcast_to(rhs, val_shape) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, put_ev = ti._put( + x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, put_ev) + + +def take(x, indices, /, *, axis=None, out=None, mode="wrap"): + """take(x, indices, axis=None, out=None, mode="wrap") + + Takes elements from an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array that elements will be taken from. + indices (usm_ndarray): + One-dimensional array of indices. + axis (int, optional): + The axis along which the values will be selected. 
+ If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + out (Optional[usm_ndarray]): + Output array to populate. Array must have the correct + shape and the expected data type. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + Returns: + usm_ndarray: + Array with shape + ``x.shape[:axis] + indices.shape + x.shape[axis + 1:]`` + filled with elements from ``x``. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if indices.dtype.kind not in "ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue]) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + res_usm_type = dpctl.utils.get_coerced_usm_type( + [x.usm_type, indices.usm_type] + ) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + res_shape = indices.shape + + dt = x.dtype + + 
orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + if dt != out.dtype: + raise ValueError( + f"Output array of type {dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpctl.utils.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if ti._array_overlap(x, out): + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, take_ev = ti._take( + x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, take_ev) + + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[take_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_ev) + out = orig_out + + return out diff --git a/dpctl_ext/tensor/_numpy_helper.py b/dpctl_ext/tensor/_numpy_helper.py new file mode 100644 index 000000000000..4ad735823cb3 --- /dev/null +++ b/dpctl_ext/tensor/_numpy_helper.py @@ -0,0 +1,45 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import numpy as np + +_npver = np.lib.NumpyVersion(np.__version__) + +if _npver < "1.25.0": # pragma: no cover + from numpy import AxisError +else: + from numpy.exceptions import AxisError + +if _npver >= "2.0.0": + from numpy._core.numeric import normalize_axis_index, normalize_axis_tuple +else: # pragma: no cover + from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple + + +__all__ = ["AxisError", "normalize_axis_index", "normalize_axis_tuple"] diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..1b2c79d2e2a5 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -0,0 +1,427 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/indexing_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +using dpctl::tensor::ssize_t; + +template +class TakeFunctor +{ +private: + const char *src_ = nullptr; + char *dst_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + TakeFunctor(const char *src_cp, + char *dst_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : src_(src_cp), dst_(dst_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + 
axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + const T *src = reinterpret_cast(src_); + T *dst = reinterpret_cast(dst_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t src_offset = orthog_offsets.get_first_offset(); + ssize_t dst_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + src_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + dst_offset += axes_strider(i_along); + + dst[dst_offset] = src[src_offset]; + } +}; + +template +class take_kernel; + +typedef sycl::event (*take_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + const char *, + char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event take_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + const char *src_p, + char *dst_p, + char **ind_p, + ssize_t src_offset, + ssize_t dst_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event take_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, src_offset, dst_offset, + orthog_shape_and_strides}; + + using 
NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + take_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + TakeFunctor( + src_p, dst_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return take_ev; +} + +template +class PutFunctor +{ +private: + char *dst_ = nullptr; + const char *val_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + PutFunctor(char *dst_cp, + const char *val_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : dst_(dst_cp), val_(val_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + T *dst = reinterpret_cast(dst_); + const T *val = reinterpret_cast(val_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t dst_offset = orthog_offsets.get_first_offset(); + ssize_t val_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); 
+ + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + dst_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + val_offset += axes_strider(i_along); + + dst[dst_offset] = val[val_offset]; + } +}; + +template +class put_kernel; + +typedef sycl::event (*put_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + char *, + const char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event put_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + char *dst_p, + const char *val_p, + char **ind_p, + ssize_t dst_offset, + ssize_t val_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event put_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, dst_offset, val_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + put_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + PutFunctor( + dst_p, val_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return put_ev; +} + +template +struct 
TakeWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct TakeClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp new file mode 100644 index 000000000000..244acfe3955f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -0,0 +1,819 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.take and +/// dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/integer_advanced_indexing.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "integer_advanced_indexing.hpp" + +#define INDEXING_MODES 2 +#define WRAP_MODE 0 +#define CLIP_MODE 1 + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing::put_fn_ptr_t; +using dpctl::tensor::kernels::indexing::take_fn_ptr_t; + +static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::vector + _populate_kernel_params(sycl::queue &exec_q, + std::vector &host_task_events, + char **device_ind_ptrs, + py::ssize_t *device_ind_sh_st, + py::ssize_t *device_ind_offsets, + py::ssize_t *device_orthog_sh_st, + py::ssize_t *device_along_sh_st, + const py::ssize_t *inp_shape, + const py::ssize_t *arr_shape, + std::vector &inp_strides, + std::vector &arr_strides, + std::vector &ind_sh_sts, + std::vector &ind_ptrs, + std::vector &ind_offsets, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int orthog_sh_elems, + int ind_sh_elems) +{ + + using usm_host_allocator_T = + 
dpctl::tensor::alloc_utils::usm_host_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using shT = std::vector; + + usm_host_allocatorT sz_allocator(exec_q); + std::shared_ptr host_ind_sh_st_shp = + std::make_shared(ind_sh_elems * (k + 1), sz_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, sz_allocator); + + std::shared_ptr host_orthog_sh_st_shp = + std::make_shared(3 * orthog_sh_elems, sz_allocator); + + std::shared_ptr host_along_sh_st_shp = + std::make_shared(2 * (k + ind_sh_elems), sz_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_sh_st_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + const sycl::event &device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size()); + + const sycl::event &device_ind_sh_st_copy_ev = + exec_q.copy(host_ind_sh_st_shp->data(), device_ind_sh_st, + host_ind_sh_st_shp->size()); + + const sycl::event &device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), device_ind_offsets, + host_ind_offsets_shp->size()); + + int orthog_nd = inp_nd - k; + + if (orthog_nd > 0) { + if (axis_start > 0) { + std::copy(inp_shape, inp_shape + axis_start, + host_orthog_sh_st_shp->begin()); + std::copy(inp_strides.begin(), inp_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + orthog_sh_elems); + std::copy(arr_strides.begin(), arr_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems); + } + if (inp_nd > (axis_start + k)) { + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + host_orthog_sh_st_shp->begin() + axis_start); + std::copy(inp_strides.begin() + 
axis_start + k, inp_strides.end(), + host_orthog_sh_st_shp->begin() + orthog_sh_elems + + axis_start); + + std::copy(arr_strides.begin() + axis_start + ind_nd, + arr_strides.end(), + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems + + axis_start); + } + } + + if (inp_nd > 0) { + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + host_along_sh_st_shp->begin()); + + std::copy(inp_strides.begin() + axis_start, + inp_strides.begin() + axis_start + k, + host_along_sh_st_shp->begin() + k); + } + + if (ind_nd > 0) { + std::copy(arr_shape + axis_start, arr_shape + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k); + std::copy(arr_strides.begin() + axis_start, + arr_strides.begin() + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k + ind_nd); + } + + const sycl::event &device_orthog_sh_st_copy_ev = exec_q.copy( + host_orthog_sh_st_shp->data(), device_orthog_sh_st, + host_orthog_sh_st_shp->size()); + + const sycl::event &device_along_sh_st_copy_ev = exec_q.copy( + host_along_sh_st_shp->data(), device_along_sh_st, + host_along_sh_st_shp->size()); + + const sycl::event &shared_ptr_cleanup_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({device_along_sh_st_copy_ev, + device_orthog_sh_st_copy_ev, + device_ind_offsets_copy_ev, + device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); + cgh.host_task( + [host_ind_offsets_shp = std::move(host_ind_offsets_shp), + host_ind_sh_st_shp = std::move(host_ind_sh_st_shp), + host_ind_ptrs_shp = std::move(host_ind_ptrs_shp), + host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp), + host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {}); + }); + host_task_events.push_back(shared_ptr_cleanup_ev); + + std::vector sh_st_pack_deps{ + device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev, + device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev, + device_along_sh_st_copy_ev}; + return sh_st_pack_deps; +} + +/* Utility to parse python object py_ind into vector of `usm_ndarray`s */ 
+std::vector parse_py_ind(const sycl::queue &q, + const py::object &py_ind) +{ /* Converts every element of the Python sequence py_ind to a usm_ndarray; throws py::value_error if an element is not queue-compatible with q or if the arrays differ in number of dimensions. */ + std::size_t ind_count = py::len(py_ind); + std::vector res; + res.reserve(ind_count); + + bool nd_is_known = false; + int nd = -1; /* ndim of the first index array, assigned on the first iteration */ + for (std::size_t i = 0; i < ind_count; ++i) { + py::object el_i = py_ind[py::cast(i)]; + dpctl::tensor::usm_ndarray arr_i = + py::cast(el_i); + if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { + throw py::value_error("Index allocation queue is not compatible " + "with execution queue"); + } + if (nd_is_known) { + if (nd != arr_i.get_ndim()) { + throw py::value_error( + "Indices must have the same number of dimensions."); + } + } + else { + nd_is_known = true; + nd = arr_i.get_ndim(); + } + res.push_back(arr_i); + } + + return res; +} + +std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &dst, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ /* Validates the arguments, stages shape/stride/pointer metadata into device temporaries and submits the kernel looked up in take_dispatch_table[mode][src type][index type]. Returns the pair (host-task event keeping the arguments alive, kernel event). */ + std::vector ind = parse_py_ind(exec_q, py_ind); + + int k = ind.size(); /* number of index arrays; they address axes [axis_start, axis_start + k) of src */ + + if (k == 0) { + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; /* first index array; the others are compared against it below */ + + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(src_nd, 1); /* a 0-d src is range-checked as if it had one axis */ + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(src_nd)); + } + if (src_nd == 0) { + if (dst_nd != ind_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + else { + if (dst_nd != (src_nd - k + ind_nd)) { /* the k indexed axes of src are replaced by the ind_nd axes of the index arrays */ + throw py::value_error( + "Destination is not of appropriate dimension for 
take kernel."); + } + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (src_nd - k); ++i) { /* axes of src that are not indexed must match the corresponding axes of dst */ + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(src_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { /* a non-indexed axis has zero length: nothing to submit */ + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Array memory overlap."); + } + + py::ssize_t src_offset = py::ssize_t(0); + py::ssize_t dst_offset = py::ssize_t(0); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == dst_shape[axis_start + i])) { + throw py::value_error( + "Indices shape does not match shape of axis in destination."); + } + } + +
dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * ind_nelems); + + int ind_sh_elems = std::max(ind_nd, 1); /* 0-d index arrays still get one slot each in the packed buffers */ + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + + std::vector ind_offsets; + ind_offsets.reserve(k); + + std::vector ind_sh_sts((k + 1) * ind_sh_elems, 0); /* host staging buffer: ind_shape followed by the strides of each index array */ + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(dst, ind_)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // rearrange to past where indices shapes are checked + //
packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k-1] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(src_nd - k, 1); + + // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], + // src_strides[:axis] + src_strides[axis+k:], + // dst_strides[:axis] + + // dst_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [src_shape[axis:axis+k], + // src_strides[axis:axis+k], + // dst_shape[axis:axis+ind.ndim], + // dst_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, dst_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, src_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; /* the kernel waits on both the packing events and the caller's depends */ + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), 
std::begin(depends), std::end(depends)); + + auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; + + if (fn == nullptr) { /* no kernel registered for this type combination */ + sycl::event::wait(host_task_events); /* wait for the events recorded so far before throwing */ + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event take_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, + src_offset, dst_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {take_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, take_generic_ev); +} + +std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &val, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ /* Validates the arguments and submits the kernel from put_dispatch_table that writes val into dst at the given indices. Returns the pair (host-task event keeping the arguments alive, kernel event). */ + std::vector ind = parse_py_ind(exec_q, py_ind); + int k = ind.size(); /* number of index arrays; they address axes [axis_start, axis_start + k) of dst */ + + if (k == 0) { + // no indices to write to + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int dst_nd = dst.get_ndim(); + int val_nd = val.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(dst_nd, 1); /* a 0-d dst is range-checked as if it had one axis */ + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of 
range for array of dimension " + + std::to_string(dst_nd)); + } + if (dst_nd == 0) { /* NOTE(review): the value checked in both branches is the ndim of val, although the message says "Destination" */ + if (val_nd != ind_nd) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + else { + if (val_nd != (dst_nd - k + ind_nd)) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + + std::size_t dst_nelems = dst.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *val_shape = val.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (dst_nd - k); ++i) { /* axes of dst that are not indexed must match the corresponding axes of val */ + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(dst_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { /* a non-indexed axis has zero length: nothing to submit */ + return std::make_pair(sycl::event(), sycl::event()); + } + + char *dst_data = dst.get_data(); + char *val_data = val.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(val, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + py::ssize_t dst_offset = py::ssize_t(0); + py::ssize_t val_offset = py::ssize_t(0); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int dst_typenum = dst.get_typenum(); + int val_typenum = val.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + int val_type_id = array_types.typenum_to_lookup_id(val_typenum); + + if (dst_type_id != val_type_id) { + throw py::type_error("Array data types are not 
the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == val_shape[axis_start + i])) { + throw py::value_error( + "Indices shapes does not match shape of axis in vals."); + } + } + + auto ind_sh_elems = std::max(ind_nd, 1); /* 0-d index arrays still get one slot each in the packed buffers */ + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); /* host staging buffer: ind_shape followed by the strides of each index array */ + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); /* ind_sh_elems equals ind_nd inside this branch */ + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(ind_, dst)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + +
ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k-1] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(dst_nd - k, 1); + + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:], + // val_strides[:axis] + + // val_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [dst_shape[axis:axis+k], + // dst_strides[axis:axis+k], + // val_shape[axis:axis+ind.ndim], + // val_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto dst_strides = dst.get_strides_vector(); + auto val_strides = val.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, val_shape, dst_strides, 
val_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, dst_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; /* the kernel waits on both the packing events and the caller's depends */ + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; + + if (fn == nullptr) { /* no kernel registered for this type combination */ + sycl::event::wait(host_task_events); /* wait for the events recorded so far before throwing */ + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event put_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, + dst_offset, val_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {put_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events); + + return std::make_pair(arg_cleanup_ev, put_generic_ev); +} + +void init_advanced_indexing_dispatch_tables(void) +{ /* Populates take_dispatch_table and put_dispatch_table for CLIP_MODE and WRAP_MODE from the corresponding kernel factories. */ + using namespace td_ns; + + using dpctl::tensor::kernels::indexing::TakeClipFactory; + DispatchTableBuilder + dtb_takeclip; + dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::TakeWrapFactory; + DispatchTableBuilder + dtb_takewrap; + dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]); + + using dpctl::tensor::kernels::indexing::PutClipFactory; + DispatchTableBuilder dtb_putclip; + dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]); + + using 
dpctl::tensor::kernels::indexing::PutWrapFactory; + DispatchTableBuilder dtb_putwrap; + dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..57f0ddda132c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -0,0 +1,73 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.take and dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern void init_advanced_indexing_dispatch_tables(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..c18761031fd0 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -55,7 +55,7 @@ #include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include 
"full_ctor.hpp" -// #include "integer_advanced_indexing.hpp" +#include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" // #include "repeat.hpp" @@ -110,8 +110,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; // using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ -// using dpctl::tensor::py_internal::usm_ndarray_put; -// using dpctl::tensor::py_internal::usm_ndarray_take; +using dpctl::tensor::py_internal::usm_ndarray_put; +using dpctl::tensor::py_internal::usm_ndarray_take; // using dpctl::tensor::py_internal::py_extract; // using dpctl::tensor::py_internal::py_mask_positions; @@ -145,7 +145,7 @@ void init_dispatch_tables(void) init_copy_and_cast_usm_to_usm_dispatch_tables(); // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); - // init_advanced_indexing_dispatch_tables(); + init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; } @@ -332,23 +332,23 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_take", &usm_ndarray_take, - // "Takes elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` from array `src` and copies them " - // "into usm_ndarray `dst` synchronously." - // "Returns a tuple of events: (hev, ev)", - // py::arg("src"), py::arg("ind"), py::arg("dst"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); - - // m.def("_put", &usm_ndarray_put, - // "Puts elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` into array `dst` from " - // "usm_ndarray `val` synchronously." 
- // "Returns a tuple of events: (hev, ev)", - // py::arg("dst"), py::arg("ind"), py::arg("val"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_take", &usm_ndarray_take, /* adjacent literals are concatenated, so each fragment ends with a space */ + "Takes elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` from array `src` and copies them " + "into usm_ndarray `dst` synchronously. " + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_put", &usm_ndarray_put, /* adjacent literals are concatenated, so each fragment ends with a space */ + "Puts elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` into array `dst` from " + "usm_ndarray `val` synchronously. " + "Returns a tuple of events: (hev, ev)", + py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_eye", &usm_ndarray_eye, // "Fills input 2D contiguous usm_ndarray `dst` with " From 87e5482f2faf3bff2549b48c999bbab516fce168 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:59:18 -0800 Subject: [PATCH 14/50] Use put/take from dpctl_ext.tensor in dpnp --- dpnp/dpnp_iface_indexing.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6e7ab778299b..6421f39fd4e4 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -52,6 +52,8 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp # pylint: disable=no-name-in-module @@ -295,7 +297,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti._array_overlap(x, out): + 
if ti_ext._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. out = dpt.empty_like(out) else: @@ -304,7 +306,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti._take( + h_ev, take_ev = ti_ext._take( src=x, ind=(inds,), dst=out, @@ -813,7 +815,7 @@ def extract(condition, a): usm_a = dpt.reshape(usm_a, -1) usm_cond = dpt.reshape(usm_cond, -1) - usm_res = dpt.take(usm_a, dpt.nonzero(usm_cond)[0]) + usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: usm_a = dpt.reshape(usm_a, -1) @@ -1713,7 +1715,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if axis is None and usm_a.ndim > 1: usm_a = dpt.reshape(usm_a, -1) - dpt.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) + dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) From b537f30115be31858782e6a7ace1fc52f54c5f9d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 10:33:51 -0800 Subject: [PATCH 15/50] Move full() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 169 ++++++++++ .../include/kernels/constructors.hpp | 171 ++++++++++ .../tensor/libtensor/source/full_ctor.cpp | 315 ++++++++++++++++++ .../tensor/libtensor/source/full_ctor.hpp | 60 ++++ .../tensor/libtensor/source/tensor_ctors.cpp | 14 +- 7 files changed, 727 insertions(+), 8 deletions(-) create mode 100644 dpctl_ext/tensor/_ctors.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 
ae8b72d71873..0c52d766afbf 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -52,7 +52,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 35453dbf9a46..9f4c27608a99 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -27,12 +27,16 @@ # ***************************************************************************** +from dpctl_ext.tensor._ctors import ( + full, +) from dpctl_ext.tensor._indexing_functions import ( put, take, ) __all__ = [ + "full", "put", "take", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py new file mode 100644 index 000000000000..5caa07099c56 --- /dev/null +++ b/dpctl_ext/tensor/_ctors.py @@ -0,0 +1,169 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numbers import Number + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._data_types import _get_dtype +from dpctl.tensor._device import normalize_queue_device + +import dpctl_ext.tensor._tensor_impl as ti + + +def _cast_fill_val(fill_val, dt): + """ + Casts the Python scalar `fill_val` to another Python type coercible to the + requested data type `dt`, if necessary. + """ + val_type = type(fill_val)  # exact type: bool and NumPy scalars reach the final else + if val_type in [float, complex] and np.issubdtype(dt, np.integer): + return int(fill_val.real)  # imaginary part dropped; int() truncates toward zero + elif val_type is complex and np.issubdtype(dt, np.floating): + return fill_val.real  # imaginary part dropped + elif val_type is int and np.issubdtype(dt, np.integer): + return _to_scalar(fill_val, dt)  # converted through NumPy with dtype dt + else: + return fill_val + + +def _to_scalar(obj, sc_ty): + """A way to convert object to NumPy scalar type. 
+ Raises OverflowError if obj can not be represented + using the requested scalar type. + """ + zd_arr = np.asarray(obj, dtype=sc_ty) + return zd_arr[()]  # indexing with () extracts the element of a 0-d array + + +def _validate_fill_value(fill_val): + """Raises TypeError unless `fill_val` is a numeric or boolean scalar.""" + # TODO: verify if `np.True_` and `np.False_` should be instances of + # Number in NumPy, like other NumPy scalars and like Python bools + # check for `np.bool_` separately as NumPy<2 has no `np.bool` + if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_): + raise TypeError( + f"array cannot be filled with scalar of type {type(fill_val)}" + ) + + +def full( + shape, + fill_value, + *, + dtype=None, + order="C", + device=None, + usm_type=None, + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with `fill_value`. + + Args: + shape (tuple): + Dimensions of the array to be created. + fill_value (int,float,complex,usm_ndarray,ndarray,tuple,list): + scalar fill value, or array-like broadcast to ``shape`` + dtype (optional): data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. 
+ If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper()  # only the first character of `order` is used + dpctl.utils.validate_usm_type(usm_type, allow_none=True) + + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + if ( + isinstance(fill_value, dpt.usm_ndarray) + and sycl_queue is None + and device is None + ): + sycl_queue = fill_value.sycl_queue  # no queue or device given: reuse the fill array's queue + else: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + X = dpt.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + return dpt.copy(dpt.broadcast_to(X, shape), order=order)  # array-like path returns here + else: + _validate_fill_value(fill_value) + + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + usm_type = usm_type if usm_type is not None else "device" + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + fill_value = _cast_fill_val(fill_value, dtype) + + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 000000000000..dfd1b889aafe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,171 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace constructors +{ + +using dpctl::tensor::ssize_t; + +/*! + @defgroup CtorKernels + */ + +template +class full_strided_kernel; + +using namespace dpctl::tensor::offset_utils; + +/* ================ Full ================== */ + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &q, + std::size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +template +class FullStridedFunctor +{ +private: + Ty *p = nullptr; + Ty fill_v; + IndexerT indexer; + +public: + FullStridedFunctor(Ty *p_, const Ty &fill_v_, const IndexerT &indexer_) + : p(p_), fill_v(fill_v_), indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + auto offset = indexer(id.get(0)); + p[offset] = fill_v; + } +}; + +/*! + * @brief Function to submit kernel to fill given strided memory allocation + * with specified value.
+ * + * @param q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &q, + int nd, + std::size_t nelems, + const ssize_t *shape_strides, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + dstTy *dst_tp = reinterpret_cast(dst_p); + + using dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexer strided_indexer(nd, 0, shape_strides); + + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = full_strided_kernel; + using Impl = FullStridedFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(dst_tp, fill_v, strided_indexer)); + }); + + return fill_ev; +} + +} // namespace constructors +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp new file mode 100644 index 000000000000..e1f61be4a12a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -0,0 +1,315 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "full_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + sycl::event fill_ev; + + if constexpr (sizeof(dstTy) == sizeof(char)) { + const auto memset_val = sycl::bit_cast(fill_v); + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + bool is_zero = false; + if constexpr (sizeof(dstTy) == 1) { + is_zero = (std::uint8_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 2) { + is_zero = + (std::uint16_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 4) { + is_zero = + (std::uint32_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 8) { + is_zero = + (std::uint64_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 16) { + struct UInt128 + { + + constexpr UInt128() : v1{}, v2{} {} + UInt128(const UInt128 &) = default; + + operator bool() const + { + return bool(!v1) && bool(!v2); + } + + std::uint64_t v1; + std::uint64_t v2; + }; + is_zero = static_cast(sycl::bit_cast(fill_v)); + } + + if (is_zero) { + static constexpr int memset_val = 0; + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + using dpctl::tensor::kernels::constructors::full_contig_impl; + + fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + } + } + + return fill_ev; +} + +template +struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &, + int, + std::size_t, + py::ssize_t *, + const py::object &, + char *, + const std::vector &); + +/*! 
+ * @brief Function to submit kernel to fill given strided memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &exec_q, + int nd, + std::size_t nelems, + py::ssize_t *shape_strides, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + using dpctl::tensor::kernels::constructors::full_strided_impl; + sycl::event fill_ev = full_strided_impl( + exec_q, nd, nelems, shape_strides, fill_v, dst_p, depends); + + return fill_ev; +} + +template +struct FullStridedFactory +{ + fnT get() + { + fnT f = full_strided_impl; + return f; + } +}; + +static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types]; +static full_strided_fn_ptr_t full_strided_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // py_value should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + 
dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = full_contig_dispatch_vector[dst_typeid]; + + sycl::event full_contig_event = + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {full_contig_event}), + full_contig_event); + } + else { + int nd = dst.get_ndim(); + auto const &dst_shape = dst.get_shape_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + auto fn = full_strided_dispatch_vector[dst_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, dst_shape, dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &full_strided_ev = + fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data, + {copy_shape_ev}); + + // free shape_strides + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {full_strided_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events), + full_strided_ev); + } +} + +void init_full_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(full_contig_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(full_strided_dispatch_vector); +} + +} // namespace py_internal +} // 
namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp new file mode 100644 index 000000000000..d664b2013506 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_full_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c18761031fd0..c72c0b49622a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -54,7 +54,7 @@ // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" -// #include "full_ctor.hpp" +#include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" @@ -103,7 +103,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ================ Full ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_full; +using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ @@ -159,7 +159,7 @@ void init_dispatch_vectors(void) // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); - // init_full_ctor_dispatch_vectors(); + init_full_ctor_dispatch_vectors(); // init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // 
init_triul_ctor_dispatch_vectors(); @@ -327,10 +327,10 @@ PYBIND11_MODULE(_tensor_impl, m) // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_full_usm_ndarray", &usm_ndarray_full, - // "Populate usm_ndarray `dst` with given fill_value.", - // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_full_usm_ndarray", &usm_ndarray_full, + "Populate usm_ndarray `dst` with given fill_value.", + py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_take", &usm_ndarray_take, "Takes elements at usm_ndarray indices `ind` and axes starting " From d50f263f089dfd52edb4daa15edd3f86807965e5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:06:00 -0800 Subject: [PATCH 16/50] Use full and _full_usm_ndarray from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 6 ++++-- dpnp/dpnp_container.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 112ea3af0fdb..f7e6f0f608b1 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,12 +32,14 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val from dpctl.tensor._tensor_impl import ( - _copy_usm_ndarray_into_usm_ndarray, - _full_usm_ndarray, _zeros_usm_ndarray, ) import dpnp +from dpctl_ext.tensor._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _full_usm_ndarray, +) def dpnp_fill(arr, val): diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 4975db17c717..b13bf96cda28 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,7 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -228,7 +229,7 @@ def full( fill_value = fill_value.get_array() """Creates 
`dpnp_array` having a specified shape, filled with fill_value.""" - array_obj = dpt.full( + array_obj = dpt_ext.full( shape, fill_value, dtype=dtype, From f189dc540477ceadf35dcb127325056c5e0c406b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:22:55 -0800 Subject: [PATCH 17/50] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..4ae07ccbbdb9 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +dpctl_ext/**/*.cpython*.so From f9a181721784c843907c16e2e1d5569c487cf9e3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:23:51 -0800 Subject: [PATCH 18/50] Move _zeros_usm_ndarray to dpctl_ext --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../tensor/libtensor/source/tensor_ctors.cpp | 12 +- .../tensor/libtensor/source/zeros_ctor.cpp | 168 ++++++++++++++++++ .../tensor/libtensor/source/zeros_ctor.hpp | 59 ++++++ 4 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 0c52d766afbf..cb468b9a226d 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -53,7 +53,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp diff --git 
a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c72c0b49622a..b55439162f90 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -64,7 +64,7 @@ #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" -// #include "zeros_ctor.hpp" +#include "zeros_ctor.hpp" namespace py = pybind11; @@ -107,7 +107,7 @@ using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_zeros; +using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ using dpctl::tensor::py_internal::usm_ndarray_put; @@ -160,7 +160,7 @@ void init_dispatch_vectors(void) // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); - // init_zeros_ctor_dispatch_vectors(); + init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // init_triul_ctor_dispatch_vectors(); @@ -323,9 +323,9 @@ PYBIND11_MODULE(_tensor_impl, m) // synchronously.", py::arg("src"), py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, - // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); m.def("_full_usm_ndarray", &usm_ndarray_full, "Populate usm_ndarray `dst` with given fill_value.", diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp new file mode 100644 index 000000000000..4558743b3c22 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -0,0 +1,168 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "zeros_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with zeros. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event zeros_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + char *dst_p, + const std::vector &depends) +{ + + static constexpr int memset_val(0); + sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + + return fill_ev; +} + +template +struct ZerosContigFactory +{ + fnT get() + { + fnT f = zeros_contig_impl; + return f; + } +}; + +static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = zeros_contig_dispatch_vector[dst_typeid]; + + sycl::event zeros_contig_event = + fn(exec_q, static_cast(dst_nelems), dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {zeros_contig_event}), + zeros_contig_event); + } + else { + throw std::runtime_error( + "Only population of contiguous usm_ndarray objects is supported."); + } +} + +void init_zeros_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(zeros_contig_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl 
diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp new file mode 100644 index 000000000000..51270a3443cc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -0,0 +1,59 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_zeros_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4b8505acf111ec2636afa0d2a9a25cf8677e02c7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:25:05 -0800 Subject: [PATCH 19/50] Use _zeros_usm_ndarray from dpctl_ext in dpnp_fill.py --- dpnp/dpnp_algo/dpnp_fill.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index f7e6f0f608b1..0d6640c3b8b5 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -31,14 +31,12 @@ import dpctl.tensor as dpt import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -from dpctl.tensor._tensor_impl import ( - _zeros_usm_ndarray, -) import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, + _zeros_usm_ndarray, ) From 61106b2e208d7f331bebc3335a49bc23212510c1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:39:35 -0800 Subject: [PATCH 20/50] Move linear-sequence implementations to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 178 ++++++++++ .../libtensor/source/linear_sequences.cpp | 312 ++++++++++++++++++ .../libtensor/source/linear_sequences.hpp | 69 ++++ 
.../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 579 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index cb468b9a226d..af0e2a7aa49f 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index dfd1b889aafe..20775b071ea8 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,11 +58,189 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ +template +class linear_sequence_step_kernel; +template +class linear_sequence_affine_kernel; template class full_strided_kernel; +// template class eye_kernel; using namespace dpctl::tensor::offset_utils; +template +class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using 
dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param exec_q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +// Constructor to populate tensor with linear sequence defined by +// start and end data + +template +class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + std::size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 
1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using reT = typename Ty::value_type; + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); + re_comb = + sycl::fma(end_v.real(), _wc, + re_comb); // start_v.real() * _w + end_v.real() * _wc; + auto im_comb = + sycl::fma(start_v.imag(), _w, + reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; + im_comb = sycl::fma(end_v.imag(), _wc, im_comb); + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else if constexpr (std::is_floating_point::value) { + Ty _w = static_cast(w); + Ty _wc = static_cast(wc); + auto affine_comb = + sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; + affine_comb = sycl::fma(end_v, _wc, affine_comb); + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Starting value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const bool device_supports_doubles = + exec_q.get_device().has(sycl::aspect::fp64); + const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; + + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + else { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + }); + + return lin_space_affine_event; +} + /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 000000000000..02c4a8ad0fa1 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,312 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "linear_sequences.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty step_v = py::cast(step); + + using dpctl::tensor::kernels::constructors::lin_space_step_impl; + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty end_v = py::cast(end); + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_linear_sequence_step(const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; + + linspace_step_event = + fn(exec_q, 
static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair + usm_ndarray_linear_sequence_affine(const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; + + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {linspace_affine_event}), + linspace_affine_event); +} + +/*! + * @brief Factory to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template +struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. 
+ */ +template +struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 000000000000..321cd2f23efe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,69 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair usm_ndarray_linear_sequence_step( + const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b55439162f90..dd660c497f9a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include 
"kernels/dpctl_tensor_types.hpp" -// #include "linear_sequences.hpp" +#include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" // #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - // init_linear_sequences_dispatch_vectors(); + init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,22 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = // py::list()); - // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and step `dt`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("dt"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and step `dt`. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("dt"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and end point `end`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("end"), py::arg("dst"), - // py::arg("include_endpoint"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and end point `end`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("end"), py::arg("dst"), + py::arg("include_endpoint"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From a030579be8525d6f23674d5c9a4a171ab842f500 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:40:33 -0800 Subject: [PATCH 21/50] Use _tensor_impl from dpctl_ext in dpnp_utils_fft.py --- dpnp/fft/dpnp_utils_fft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 4e2b7aaaf842..c692774a424f 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,7 +42,6 @@ from collections.abc import Sequence import dpctl -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -51,6 +50,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.fft._fft_impl as fi From a1d6fa39ba8607b191177d6acb0ca2f3cf8f49fc Mon Sep 17 00:00:00 2001 From: Vladislav 
Perevezentsev Date: Fri, 6 Feb 2026 03:03:08 -0800 Subject: [PATCH 22/50] Move tril()/triu() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 157 +++++++++++ .../include/kernels/constructors.hpp | 138 ++++++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 46 ++-- .../tensor/libtensor/source/triul_ctor.cpp | 253 ++++++++++++++++++ .../tensor/libtensor/source/triul_ctor.hpp | 62 +++++ 7 files changed, 638 insertions(+), 24 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index af0e2a7aa49f..1375c8316754 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -54,7 +54,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 9f4c27608a99..3c6939eff7a0 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -29,6 +29,8 @@ from dpctl_ext.tensor._ctors import ( full, + tril, + triu, ) from dpctl_ext.tensor._indexing_functions import ( put, @@ -39,4 +41,6 @@ "full", "put", "take", + "tril", + "triu", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 5caa07099c56..a0e7b28e66ff 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -26,6 +26,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** +import operator from numbers import Number import dpctl @@ -167,3 +168,159 @@ def full( hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) _manager.add_event_pair(hev, full_ev) return res + + +def tril(x, /, *, k=0): + """ + Returns the lower triangular part of a matrix (or a stack of matrices) + ``x``. + + The lower triangular part of the matrix is defined as the elements on and + below the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal above which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + A lower-triangular array or a stack of lower-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." 
+ ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k >= shape[nd - 1] - 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + elif k < -shape[nd - 2]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, tril_ev = ti._tril( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, tril_ev) + + return res + + +def triu(x, /, *, k=0): + """ + Returns the upper triangular part of a matrix (or a stack of matrices) + ``x``. + + The upper triangular part of the matrix is defined as the elements on and + above the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal below which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + An upper-triangular array or a stack of upper-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." 
+ ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k > shape[nd - 1]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + elif k <= -shape[nd - 2] + 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, triu_ev = ti._triu( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, triu_ev) + + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 20775b071ea8..8d53655b2754 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -343,6 +343,144 @@ sycl::event full_strided_impl(sycl::queue &q, return fill_ev; } +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &, + ssize_t, // inner_range //ssize_t + ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + ssize_t, // nd + ssize_t *, // shape_and_strides + ssize_t, // k + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. 
+ * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. + * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. + * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +class tri_kernel; +template +sycl::event tri_impl(sycl::queue &exec_q, + ssize_t inner_range, + ssize_t outer_range, + char *src_p, + char *dst_p, + ssize_t nd, + ssize_t *shape_and_strides, + ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + static constexpr int d2 = 2; + ssize_t src_s = nd; + ssize_t dst_s = 2 * nd; + ssize_t nd_1 = nd - 1; + ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + ssize_t outer_gid = idx[0] / inner_range; + ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + ssize_t src_inner_offset = 0, dst_inner_offset = 0; + bool to_copy{false}; + + { + using dpctl::tensor::strides::CIndexer_array; + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); 
+ src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if constexpr (upper) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + ssize_t src_offset = 0; + ssize_t dst_offset = 0; + { + using dpctl::tensor::strides::CIndexer_vector; + CIndexer_vector<ssize_t> outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template <typename fnT, typename Ty> +struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl<Ty, /*tril*/ true>; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. 
+ * @ingroup CtorKernels + */ +template <typename fnT, typename Ty> +struct TriuGenericFactory +{ + fnT get() + { + fnT f = tri_impl<Ty, /*triu*/ false>; + return f; + } +}; + } // namespace constructors } // namespace kernels } // namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index dd660c497f9a..f2afce105f7f 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -60,7 +60,7 @@ #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" -// #include "triul_ctor.hpp" +#include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" @@ -129,7 +129,7 @@ using dpctl::tensor::py_internal::usm_ndarray_take; /* =========================== Tril and triu ============================== */ -// using dpctl::tensor::py_internal::usm_ndarray_triul; +using dpctl::tensor::py_internal::usm_ndarray_triul; /* =========================== Where ============================== */ @@ -162,7 +162,7 @@ void init_dispatch_vectors(void) init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); - // init_triul_ctor_dispatch_vectors(); + init_triul_ctor_dispatch_vectors(); // populate_masked_extract_dispatch_vectors(); // populate_masked_place_dispatch_vectors(); @@ -388,27 +388,27 @@ PYBIND11_MODULE(_tensor_impl, m) dpctl::tensor::py_internal::default_device_index_type, "Gives default index type supported by device.", py::arg("dev")); - // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector<sycl::event> depends) - // -> std::pair<sycl::event, sycl::event> { - // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); - // }; - // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // 
py::arg("depends") = py::list()); + auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector<sycl::event> depends) + -> std::pair<sycl::event, sycl::event> { + return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + }; + m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); - // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector<sycl::event> depends) - // -> std::pair<sycl::event, sycl::event> { - // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); - // }; - // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector<sycl::event> depends) + -> std::pair<sycl::event, sycl::event> { + return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + }; + m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), // py::arg("cumsum"), py::arg("sycl_queue"), diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp new file mode 100644 index 000000000000..0890dfdb4766 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -0,0 +1,253 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include <algorithm> // for std::copy +#include <cstddef> // for std::size_t +#include <memory> // for std::make_shared +#include <stdexcept> // for std::runtime_error +#include <utility> // for std::pair, std::move +#include <vector> // for std::vector, std::begin, std::end + +#include <sycl/sycl.hpp> + +#include "dpnp4pybind11.hpp" +#include <pybind11/pybind11.h> + +#include "kernels/constructors.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::tri_fn_ptr_t; + +static tri_fn_ptr_t tril_generic_dispatch_vector[td_ns::num_types]; +static tri_fn_ptr_t triu_generic_dispatch_vector[td_ns::num_types]; + +std::pair<sycl::event, sycl::event> + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector<sycl::event> &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + if (src_nd < 2) { + throw py::value_error("Array dimensions less than 2."); + } + + // shapes must be the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && i < src_nd; ++i) { + 
src_nelems *= static_cast<std::size_t>(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + auto array_types = td_ns::usm_ndarray_types(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (dst_typeid != src_typeid) { + throw py::value_error("Array dtype are not the same."); + } + + // check same queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation contexts"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + using shT = std::vector<py::ssize_t>; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd - 2; + const py::ssize_t *shape = src_shape; + + const shT iter_src_strides(std::begin(src_strides), + std::begin(src_strides) + nd); + const shT iter_dst_strides(std::begin(dst_strides), + std::begin(dst_strides) + nd); + + simplify_iteration_space(nd, shape, iter_src_strides, iter_dst_strides, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + 
if (src_offset != 0 || dst_offset != 0) { + throw py::value_error("Reversed slice for dst is not supported"); + } + + nd += 2; + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator<py::ssize_t>; + using usmshT = std::vector<py::ssize_t, usm_host_allocatorT>; + + usm_host_allocatorT allocator(exec_q); + auto shp_host_shape_and_strides = + std::make_shared<usmshT>(3 * nd, allocator); + + std::copy(simplified_shape.begin(), simplified_shape.end(), + shp_host_shape_and_strides->begin()); + (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2]; + (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1]; + + std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), + shp_host_shape_and_strides->begin() + nd); + (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1]; + + std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), + shp_host_shape_and_strides->begin() + 2 * nd); + (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; + + auto dev_shape_and_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(3 * nd, + exec_q); + py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get(); + + const sycl::event &copy_shape_and_strides = exec_q.copy<py::ssize_t>( + shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); + + py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2]; + py::ssize_t outer_range = src_nelems / inner_range; + + sycl::event tri_ev; + if (part == 'l') { + auto fn = tril_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + else { + auto fn = triu_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, 
{copy_shape_and_strides}); + } + + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(tri_ev); + const auto &ctx = exec_q.get_context(); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [shp_host_shape_and_strides = std::move(shp_host_shape_and_strides), + dev_shape_and_strides, ctx]() { + // capture of shp_host_shape_and_strides ensures the underlying + // vector exists for the entire execution of copying kernel + sycl_free_noexcept(dev_shape_and_strides, ctx); + }); + }); + // since host_task now owns USM allocation, release ownership by smart + // pointer + dev_shape_and_strides_owner.release(); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev); +} + +void init_triul_ctor_dispatch_vectors(void) +{ + + using namespace td_ns; + using dpctl::tensor::kernels::constructors::TrilGenericFactory; + using dpctl::tensor::kernels::constructors::TriuGenericFactory; + + DispatchVectorBuilder<tri_fn_ptr_t, TrilGenericFactory, num_types> dvb1; + dvb1.populate_dispatch_vector(tril_generic_dispatch_vector); + + DispatchVectorBuilder<tri_fn_ptr_t, TriuGenericFactory, num_types> dvb2; + dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp new file mode 100644 index 000000000000..08889df6227f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -0,0 +1,62 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include <sycl/sycl.hpp> +#include <utility> +#include <vector> + +#include "dpnp4pybind11.hpp" +#include <pybind11/pybind11.h> + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair<sycl::event, sycl::event> + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector<sycl::event> &depends = {}); + +extern void init_triul_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From f1d6e5650910eec6f330b2de902a93a1ae95df5f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 03:05:03 -0800 Subject: [PATCH 23/50] Use tril/triu/_tril from dpctl_ext.tensor in dpnp --- dpnp/dpnp_container.py | 4 ++-- dpnp/linalg/dpnp_utils_linalg.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index b13bf96cda28..c8e28529cd57 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -270,13 +270,13 @@ def ones( def tril(x1, /, *, k=0): """Creates `dpnp_array` as lower triangular part of an input array.""" - array_obj = dpt.tril(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.tril(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) def triu(x1, /, *, k=0): """Creates `dpnp_array` as upper triangular part of an input array.""" - array_obj = dpt.triu(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.triu(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 196cd2ae9da5..5fb1c099dde2 
100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -42,12 +42,12 @@ from typing import NamedTuple -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 668079060d9ece02fbb6887c2313edca9e6ecbef Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Feb 2026 02:47:35 -0800 Subject: [PATCH 24/50] Disable pylint no-name-in-module for dpctl_ext --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 1 + dpnp/dpnp_iface.py | 3 +-- dpnp/dpnp_iface_searching.py | 1 + dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index b63bf61f8dad..d8235b84e2d0 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -44,6 +44,7 @@ _validate_dtype, ) +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 832446c826ba..6220c61db6d9 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -40,6 +40,7 @@ """ # pylint: disable=protected-access +# pylint: disable=no-name-in-module import os @@ -53,8 +54,6 @@ import dpnp from .dpnp_array import dpnp_array - -# pylint: disable=no-name-in-module from .dpnp_utils import ( dpnp_descriptor, map_dtype_to_device, diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index fdbd317d31dd..74fbc9b37d13 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -41,6 +41,7 @@ import dpctl.tensor as dpt +# pylint: disable=no-name-in-module import 
dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 4d8e3cdfbd0d..2de2bc15372c 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -37,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi From 263b7175f4aab799cd4fa100602011e8e23d046b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:31:01 -0800 Subject: [PATCH 25/50] Add TODO comments --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 ++ dpnp/dpnp_iface.py | 2 ++ dpnp/dpnp_iface_searching.py | 2 ++ dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 ++ dpnp/scipy/linalg/_utils.py | 2 ++ setup.py | 2 +- 6 files changed, 11 insertions(+), 1 deletion(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index d8235b84e2d0..88abcee5035c 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -45,6 +45,8 @@ ) # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 6220c61db6d9..50b474014666 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,8 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 74fbc9b37d13..16ab633d506b 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -42,6 
+42,8 @@ import dpctl.tensor as dpt # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 2de2bc15372c..3dfd3c23ee7f 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -38,6 +38,8 @@ from dpctl.utils import ExecutionPlacementError # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 8eb9187236bf..ce832d8f4529 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,8 @@ import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/setup.py b/setup.py index a0c54b066dcf..7ffef3bed9d8 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", - # dpctl_ext + # TODO: replace with dpctl; dpctl.tensor "dpctl_ext", "dpctl_ext.tensor", ], From 4130c1b80aa108ca127040a6c4ea15bcaa86173f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:53:39 -0800 Subject: [PATCH 26/50] Use default_device_complex_type from dpctl_ext on test_array_api_info.py --- dpnp/tests/test_array_api_info.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py index b310192ffc59..32730c8724dc 100644 --- a/dpnp/tests/test_array_api_info.py +++ b/dpnp/tests/test_array_api_info.py @@ -1,9 +1,11 @@ 
-import numpy import pytest from dpctl import SyclDeviceCreationError, get_devices, select_default_device -from dpctl.tensor._tensor_impl import default_device_complex_type import dpnp + +# TODO: revert to `from dpctl.tensor....` +# when dpnp fully migrates dpctl/tensor +from dpctl_ext.tensor._tensor_impl import default_device_complex_type from dpnp.tests.helper import ( has_support_aspect64, is_win_platform, From 17ca9ab52368f3bbdbfbdf6410b82823c98c53c0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 06:59:55 -0800 Subject: [PATCH 27/50] Remove unused build_dpctl_ext function --- dpctl_ext/CMakeLists.txt | 80 ---------------------------------------- 1 file changed, 80 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index bb33a4f57332..cdb007a2d230 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -122,84 +122,4 @@ set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") -function(build_dpctl_ext _trgt _src _dest) - set(options SYCL) - cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) - add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) - set(_cythonize_trgt "${_trgt}_cythonize_pyx") - python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) - if(BUILD_DPCTL_EXT_SYCL) - add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) - target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) - target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) - if(DPCTL_OFFLOAD_COMPRESS) - target_link_options(${_trgt} PRIVATE --offload-compress) - endif() - if(_dpctl_sycl_targets) - # make fat binary - target_compile_options( - ${_trgt} - PRIVATE ${_dpctl_sycl_target_compile_options} - ) - target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE Python::NumPy) - if(DPCTL_GENERATE_COVERAGE) - 
target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) - if(BUILD_DPCTL_EXT_SYCL) - target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) - set(_linker_options "LINKER:${DPCTL_LDFLAGS}") - target_link_options(${_trgt} PRIVATE ${_linker_options}) - get_filename_component(_name_wle ${_generated_src} NAME_WLE) - get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) - set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") - set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") - - # TODO: create separate folder inside build folder that contains only - # headers related to this target and appropriate folder structure to - # eliminate shadow dependencies - get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) - # TODO: do not set directory if we did not generate header - target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) - set(_rpath_value "$ORIGIN") - if(BUILD_DPCTL_EXT_RELATIVE_PATH) - set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") - endif() - if(DPCTL_WITH_REDIST) - set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") - endif() - set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) - - install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) - install( - FILES ${_generated_api_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - install( - FILES ${_generated_public_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - if(DPCTL_GENERATE_COVERAGE) - get_filename_component(_original_src_dir ${_src} DIRECTORY) - file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) - install(FILES ${_generated_src} DESTINATION 
${CMAKE_INSTALL_PREFIX}/${_rel_dir}) - endif() - - # Create target with headers only, because python is managing all the - # library imports at runtime - set(_trgt_headers ${_trgt}_headers) - add_library(${_trgt_headers} INTERFACE) - add_dependencies(${_trgt_headers} ${_trgt}) - get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) - target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) -endfunction() - add_subdirectory(tensor) From 79cb2a45f28f5099701c0728a6def5c8961c5279 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 07:51:45 -0800 Subject: [PATCH 28/50] Apply remarks for CMake files --- dpctl_ext/CMakeLists.txt | 10 ++------- dpctl_ext/tensor/CMakeLists.txt | 38 ++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index cdb007a2d230..e58693091422 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -27,13 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** -find_package(Python REQUIRED COMPONENTS NumPy) - -# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) -# -w is to set working directory (and correctly set __pyx_f[] array of filenames) -set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") -find_package(Cython REQUIRED) - +# TODO: rework this logic to remove current duplication if(WIN32) string( CONCAT WARNING_FLAGS @@ -118,7 +112,7 @@ else() endif() # at build time create include/ directory and copy header files over -set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +# set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..28e7a4cb55f4 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,8 +27,10 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +find_package(Python COMPONENTS Development) + if(WIN32) - if(${CMAKE_VERSION} VERSION_LESS "3.23") + if(${CMAKE_VERSION} VERSION_LESS "3.27") # this is a work-around for target_link_options inserting option after -link option, cause # linker to ignore it. 
set(CMAKE_CXX_LINK_FLAGS @@ -37,6 +39,7 @@ if(WIN32) endif() endif() +# TODO: reuse this library for dpnp ufunc extension build set(_static_lib_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp ) @@ -67,11 +70,11 @@ add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) target_include_directories( ${_static_lib_trgt} PRIVATE - ${Python_INCLUDE_DIRS} - ${DPCTL_INCLUDE_DIR} + # ${Python_INCLUDE_DIRS} + # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts) @@ -94,14 +97,14 @@ set(_no_fast_math_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) -list( - APPEND _no_fast_math_sources - # ${_elementwise_sources} - # ${_reduction_sources} - # ${_sorting_sources} - # ${_linalg_sources} - # ${_accumulator_sources} -) +#list( +#APPEND _no_fast_math_sources +# ${_elementwise_sources} +# ${_reduction_sources} +# ${_sorting_sources} +# ${_linalg_sources} +# ${_accumulator_sources} +#) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) @@ -114,7 +117,7 @@ endforeach() set(_compiler_definitions "") -set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +set(_linker_options "LINKER:${DPNP_LDFLAGS}") foreach(python_module_name ${_py_trgts}) target_compile_options( ${python_module_name} @@ -124,6 +127,7 @@ foreach(python_module_name ${_py_trgts}) ${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel ) + # TODO: expand DPCTL_OFFLOAD_COMPRESS to the whole dpnp level if(DPCTL_OFFLOAD_COMPRESS) target_link_options(${python_module_name} PRIVATE --offload-compress) endif() @@ -149,22 +153,22 @@ foreach(python_module_name 
${_py_trgts}) PRIVATE -fprofile-instr-generate -fcoverage-mapping ) endif() - if(_dpctl_sycl_targets) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( ${python_module_name} - PRIVATE ${_dpctl_sycl_target_compile_options} + PRIVATE ${_dpnp_sycl_target_compile_options} ) target_link_options( ${python_module_name} - PRIVATE ${_dpctl_sycl_target_link_options} + PRIVATE ${_dpnp_sycl_target_link_options} ) endif() # TODO: update source so they reference individual libraries instead of # dpctl4pybind11.hpp. It will allow to simplify dependency tree # NOTE: dpctl C-API is resolved at runtime via Python # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) - if(DPCTL_WITH_REDIST) + if(DPNP_WITH_REDIST) set_target_properties( ${python_module_name} PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." From 4bf080edc0e5d277441fe39b31733571fbad0de3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 08:30:03 -0800 Subject: [PATCH 29/50] Apply remarks for c++ files --- .../include/kernels/copy_and_cast.hpp | 18 ++++----------- .../include/kernels/copy_as_contiguous.hpp | 19 ++++----------- .../source/copy_and_cast_usm_to_usm.cpp | 23 ++++--------------- .../source/copy_and_cast_usm_to_usm.hpp | 11 ++------- .../libtensor/source/copy_as_contig.cpp | 14 ++++------- .../libtensor/source/copy_as_contig.hpp | 11 ++------- .../source/device_support_queries.cpp | 13 ++++------- .../source/device_support_queries.hpp | 12 ++-------- .../source/simplify_iteration_space.cpp | 12 ++++------ .../source/simplify_iteration_space.hpp | 11 +++------ .../tensor/libtensor/source/tensor_ctors.cpp | 10 ++++---- 11 files changed, 43 insertions(+), 111 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp index a07d311a7fcb..d6001a11e471 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ 
b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_and_cast +namespace dpctl::tensor::kernels::copy_and_cast { using dpctl::tensor::ssize_t; @@ -1282,7 +1277,4 @@ struct CopyForRollNDShiftFactory } }; -} // namespace copy_and_cast -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_and_cast diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp index b4f367448758..37126a22dc64 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_as_contig +namespace dpctl::tensor::kernels::copy_as_contig { using dpctl::tensor::ssize_t; @@ -648,8 +643,4 @@ struct AsCContigNDBatchOfSquareMatricesFactory return as_c_contiguous_nd_batch_of_square_matrices_impl; } }; - -} // namespace copy_as_contig -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_as_contig diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp 
b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 0458aa75ac32..3d20be02f885 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -32,21 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include +#include #include -#include -#include #include -#include -#include +#include #include +#include #include "dpnp4pybind11.hpp" -#include -#include #include -#include #include "kernels/copy_and_cast.hpp" #include "utils/memory_overlap.hpp" @@ -54,16 +48,11 @@ #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -305,6 +294,4 @@ void init_copy_and_cast_usm_to_usm_dispatch_tables(void) dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp index d2a2dcaf7b85..d2e07b08d38f 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -38,13 +38,8 @@ #include #include "dpnp4pybind11.hpp" -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair copy_usm_ndarray_into_usm_ndarray( @@ -55,6 +50,4 @@ extern std::pair copy_usm_ndarray_into_usm_ndarray( extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); -} // 
namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 53b39ff5874c..7105202fe2ff 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -32,10 +32,11 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include #include +#include #include #include +#include #include #include @@ -54,13 +55,10 @@ #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::kernels::copy_as_contig:: @@ -753,6 +751,4 @@ std::pair ascontig_ev); } -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp index 2de67098b7fa..bfe3159c8813 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -32,14 +32,9 @@ #include #include "dpnp4pybind11.hpp" -#include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { std::pair @@ -56,6 +51,4 @@ std::pair void init_copy_as_contig_dispatch_vectors(void); -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp index 51eb7dba1b6c..97a8ba83831e 100644 --- 
a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -39,13 +39,11 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; + namespace { @@ -61,7 +59,6 @@ std::string _default_device_fp_type(const sycl::device &d) int get_numpy_major_version() { - namespace py = pybind11; py::module_ numpy = py::module_::import("numpy"); py::str version_string = numpy.attr("__version__"); @@ -179,6 +176,4 @@ std::string default_device_index_type(const py::object &arg) return _default_device_index_type(d); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp index 6ea01dcd49d7..adde7aefe3dd 100644 --- a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -36,14 +36,8 @@ #include "dpnp4pybind11.hpp" #include -#include -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::string default_device_fp_type(const py::object &); @@ -53,6 +47,4 @@ extern std::string default_device_bool_type(const py::object &); extern std::string default_device_complex_type(const py::object &); extern std::string default_device_index_type(const py::object &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp index 2526f022e0ac..e3cff701ed50 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ 
-34,15 +34,13 @@ #include "simplify_iteration_space.hpp" #include "utils/strided_iters.hpp" +#include #include +#include #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace py = pybind11; @@ -539,6 +537,4 @@ std::vector _unravel_index_f(py::ssize_t flat_index, return mi; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp index d3448ee1f5fd..acbc833157d1 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -36,11 +36,7 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace py = pybind11; @@ -125,6 +121,5 @@ std::vector _unravel_index_c(py::ssize_t, std::vector const &); std::vector _unravel_index_f(py::ssize_t, std::vector const &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..be69ee1a8c7e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,15 +32,17 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include -#include +// #include +// #include +// #include #include #include #include -#include +// #include +#include #include #include +#include #include "dpnp4pybind11.hpp" From cfa6cd69735591e79ca3437cc05c326ce115ffc9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:31:42 -0800 Subject: 
[PATCH 30/50] Remove linear-sequence implementations --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 177 ---------- .../libtensor/source/linear_sequences.cpp | 312 ------------------ .../libtensor/source/linear_sequences.hpp | 69 ---- .../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 19 insertions(+), 579 deletions(-) delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 1375c8316754..baf8ef5ce5f6 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 8d53655b2754..f43614e13766 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,189 +58,12 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ -template -class linear_sequence_step_kernel; -template -class linear_sequence_affine_kernel; template class full_strided_kernel; // template class eye_kernel; using namespace dpctl::tensor::offset_utils; -template -class LinearSequenceStepFunctor -{ -private: - Ty *p = nullptr; - Ty 
start_v; - Ty step_v; - -public: - LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) - : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - p[i] = Ty{start_v.real() + i * step_v.real(), - start_v.imag() + i * step_v.imag()}; - } - else { - p[i] = start_v + i * step_v; - } - } -}; - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting value and - * increment. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start_v Typed starting value of the sequence - * @param step_v Typed increment of the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. - * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty step_v, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceStepFunctor(array_data, start_v, step_v)); - }); - - return lin_space_step_event; -} - -// Constructor to populate tensor with linear sequence defined by -// start and and data - -template -class LinearSequenceAffineFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty end_v; - std::size_t n; - -public: - LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) - : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), - n((den == 0) ? 
1 : den) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - wTy wc = wTy(i) / n; - wTy w = wTy(n - i) / n; - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using reT = typename Ty::value_type; - auto _w = static_cast(w); - auto _wc = static_cast(wc); - auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); - re_comb = - sycl::fma(end_v.real(), _wc, - re_comb); // start_v.real() * _w + end_v.real() * _wc; - auto im_comb = - sycl::fma(start_v.imag(), _w, - reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; - im_comb = sycl::fma(end_v.imag(), _wc, im_comb); - Ty affine_comb = Ty{re_comb, im_comb}; - p[i] = affine_comb; - } - else if constexpr (std::is_floating_point::value) { - Ty _w = static_cast(w); - Ty _wc = static_cast(wc); - auto affine_comb = - sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; - affine_comb = sycl::fma(end_v, _wc, affine_comb); - p[i] = affine_comb; - } - else { - using dpctl::tensor::type_utils::convert_impl; - auto affine_comb = start_v * w + end_v * wc; - p[i] = convert_impl(affine_comb); - } - } -}; - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting and end values. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence. - * @param start_v Stating value of the sequence. - * @param end_v End-value of the sequence. - * @param include_endpoint Whether the end-value is included in the sequence. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty end_v, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - - const bool device_supports_doubles = - exec_q.get_device().has(sycl::aspect::fp64); - const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; - - sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - if (device_supports_doubles) { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - else { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - }); - - return lin_space_affine_event; -} - /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp deleted file mode 100644 index 02c4a8ad0fa1..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp +++ /dev/null @@ -1,312 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
-// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. 
-//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include "dpnp4pybind11.hpp" -#include -#include -#include -#include -#include -#include -#include - -#include "kernels/constructors.hpp" -#include "utils/output_validation.hpp" -#include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" - -#include "linear_sequences.hpp" - -namespace py = pybind11; -namespace td_ns = dpctl::tensor::type_dispatch; - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -// Constructor to populate tensor with linear sequence defined by -// start and step data - -typedef sycl::event (*lin_space_step_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &step, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting value and increment - * given as Python objects. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start Starting value of the sequence as Python object. Must be - * convertible to array element data type `Ty`. - * @param step Increment of the sequence as Python object. Must be convertible - * to array element data type `Ty`. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &step, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty step_v = py::cast(step); - - using dpctl::tensor::kernels::constructors::lin_space_step_impl; - - auto lin_space_step_event = lin_space_step_impl( - exec_q, nelems, start_v, step_v, array_data, depends); - - return lin_space_step_event; -} - -typedef sycl::event (*lin_space_affine_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &end, - bool include_endpoint, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting and end values given - * as Python objects. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence - * @param start Stating value of the sequence as Python object. Must be - * convertible to array data element type `Ty`. - * @param end End-value of the sequence as Python object. Must be convertible - * to array data element type `Ty`. - * @param include_endpoint Whether the end-value is included in the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &end, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty end_v = py::cast(end); - - using dpctl::tensor::kernels::constructors::lin_space_affine_impl; - - auto lin_space_affine_event = lin_space_affine_impl( - exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); - - return lin_space_affine_event; -} - -using dpctl::utils::keep_args_alive; - -static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; - -static lin_space_affine_fn_ptr_t - lin_space_affine_dispatch_vector[td_ns::num_types]; - -std::pair - usm_ndarray_linear_sequence_step(const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue is not compatible with the allocation queue"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_step_event; - - auto fn = lin_space_step_dispatch_vector[dst_typeid]; - - linspace_step_event = - fn(exec_q, 
static_cast(len), start, dt, dst_data, depends); - - return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), - linspace_step_event); -} - -std::pair - usm_ndarray_linear_sequence_affine(const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_affine_event; - - auto fn = lin_space_affine_dispatch_vector[dst_typeid]; - - linspace_affine_event = fn(exec_q, static_cast(len), start, - end, include_endpoint, dst_data, depends); - - return std::make_pair( - keep_args_alive(exec_q, {dst}, {linspace_affine_event}), - linspace_affine_event); -} - -/*! - * @brief Factor to get function pointer of type `fnT` for array with elements - * of type `Ty`. - * @defgroup CtorKernels - */ -template -struct LinSpaceStepFactory -{ - fnT get() - { - fnT f = lin_space_step_impl; - return f; - } -}; - -/*! - * @brief Factory to get function pointer of type `fnT` for array data type - * `Ty`. 
- */ -template -struct LinSpaceAffineFactory -{ - fnT get() - { - fnT f = lin_space_affine_impl; - return f; - } -}; - -void init_linear_sequences_dispatch_vectors(void) -{ - using namespace td_ns; - - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); - - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp deleted file mode 100644 index 321cd2f23efe..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp +++ /dev/null @@ -1,69 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. -//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#pragma once -#include -#include -#include - -#include "dpnp4pybind11.hpp" -#include - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -extern std::pair usm_ndarray_linear_sequence_step( - const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern std::pair usm_ndarray_linear_sequence_affine( - const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern void init_linear_sequences_dispatch_vectors(void); - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index f2afce105f7f..7e4253c0cbb6 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include 
"kernels/dpctl_tensor_types.hpp" -#include "linear_sequences.hpp" +// #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - init_linear_sequences_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,20 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = // py::list()); - m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and step `dt`. " - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("dt"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and end point `end`. 
" - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("end"), py::arg("dst"), - py::arg("include_endpoint"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and step + // `dt`. " "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and end + // point `end`. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From 087a2ecbfff6262224ff115c9948202ecf45e6ba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:58:15 -0800 Subject: [PATCH 31/50] Use _tensor_impl from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 3 +++ dpnp/dpnp_iface.py | 1 + dpnp/dpnp_iface_indexing.py | 11 +++++++---- dpnp/fft/dpnp_utils_fft.py | 14 +++++++++++--- dpnp/linalg/dpnp_utils_linalg.py | 3 +++ dpnp/scipy/linalg/_utils.py | 1 + 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 0d6640c3b8b5..4137a2794747 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -33,6 +33,9 @@ from dpctl.tensor._ctors import _cast_fill_val import dpnp + +# TODO: revert to `from dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, 
_full_usm_ndarray, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 50b474014666..533bdc36c617 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,7 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6421f39fd4e4..a01a036e16cc 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -45,7 +45,6 @@ from collections.abc import Iterable import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._copy_utils import _nonzero_impl @@ -53,7 +52,11 @@ from dpctl.tensor._numpy_helper import normalize_axis_index import dpctl_ext.tensor as dpt_ext -import dpctl_ext.tensor._tensor_impl as ti_ext + +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti import dpnp # pylint: disable=no-name-in-module @@ -297,7 +300,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti_ext._array_overlap(x, out): + if ti._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. 
out = dpt.empty_like(out) else: @@ -306,7 +309,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti_ext._take( + h_ev, take_ev = ti._take( src=x, ind=(inds,), dst=out, diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index c692774a424f..60f89a933284 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,6 +42,11 @@ from collections.abc import Sequence import dpctl + +# pylint: disable=no-name-in-module +# TODO: remove it when ti._linspace_step +# is migrated to dpctl_ext/tensor +import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -50,7 +55,10 @@ ) from dpctl.utils import ExecutionPlacementError -import dpctl_ext.tensor._tensor_impl as ti +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp import dpnp.backend.extensions.fft._fft_impl as fi @@ -196,7 +204,7 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides): if ( out is not None and out.strides == tuple(out_strides) - and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) + and not ti_ext._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) ): res_usm = dpnp.get_usm_ndarray(out) result = out @@ -524,7 +532,7 @@ def _truncate_or_pad(a, shape, axes): ) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_ev, copy_ev = ti_ext._copy_usm_ndarray_into_usm_ndarray( src=dpnp.get_usm_ndarray(a), dst=z.get_array()[tuple(index)], sycl_queue=exec_q, diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 5fb1c099dde2..171ac38a141c 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -47,6 +47,9 @@ 
from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index ce832d8f4529..665a4e1595ad 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,7 @@ import dpctl.utils as dpu +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti From f4492fbc8048d2fcc598a089715b85ed6504f02d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:28:16 -0800 Subject: [PATCH 32/50] Add missing include --- .../tensor/libtensor/include/kernels/constructors.hpp | 3 ++- .../include/kernels/integer_advanced_indexing.hpp | 4 +--- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 8 ++++---- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 5 ++++- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++++++---- .../libtensor/source/integer_advanced_indexing.hpp | 6 +++++- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 3 +-- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 7 ++----- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 3 ++- 10 files changed, 29 insertions(+), 22 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index f43614e13766..3bc4a1d16271 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -33,8 +33,9 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include +#include 
#include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 1b2c79d2e2a5..d0ec5227731c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,12 +33,10 @@ //===----------------------------------------------------------------------===// #pragma once -#include -#include #include -#include #include #include +#include #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index e1f61be4a12a..279bb9f470bc 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,15 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include -#include -#include +#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index d664b2013506..43b30fc8341c 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -33,13 +33,16 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 244acfe3955f..ed72096bff8f 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ 
b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,21 +34,23 @@ //===----------------------------------------------------------------------===// #include -#include #include #include +#include +#include #include -#include +#include #include +#include + +#include #include "dpnp4pybind11.hpp" -#include #include #include #include "kernels/integer_advanced_indexing.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 57f0ddda132c..5dfbd2f04d93 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -34,13 +34,17 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index 0890dfdb4766..f0f592c52938 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -34,8 +34,8 @@ #include // for std::copy #include // for std::size_t +#include // for std::begin, std::end #include // for std::make_shared -#include // for std::runtime_error #include // for std::pair, std::move #include // for std::vector, std::begin, std::end @@ -47,7 +47,6 @@ #include "kernels/constructors.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git 
a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index 08889df6227f..c61d95eef7ec 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -40,6 +40,8 @@ #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index 4558743b3c22..d7370f55e8cb 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,21 +32,18 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include #include -#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include -#include "kernels/constructors.hpp" #include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "zeros_ctor.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index 51270a3443cc..ec3bce994ef6 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -33,10 +33,11 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include From b367c9fd3b4b538e132afb5838584137a6f8a25c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:36:24 -0800 Subject: [PATCH 33/50] Use nested namespace syntax --- .../libtensor/include/kernels/constructors.hpp | 13 ++----------- .../include/kernels/integer_advanced_indexing.hpp | 13 ++----------- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 10 
++-------- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++-------- .../libtensor/source/integer_advanced_indexing.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 10 ++-------- 10 files changed, 20 insertions(+), 86 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 3bc4a1d16271..47726319b3e1 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -44,13 +44,7 @@ #include "utils/strided_iters.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace constructors +namespace dpctl::tensor::kernels::constructors { using dpctl::tensor::ssize_t; @@ -305,7 +299,4 @@ struct TriuGenericFactory } }; -} // namespace constructors -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::constructors diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index d0ec5227731c..7351502dbc11 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -43,13 +43,7 @@ #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace indexing +namespace dpctl::tensor::kernels::indexing { using dpctl::tensor::ssize_t; @@ -419,7 +413,4 @@ struct PutClipFactory } }; -} // namespace indexing -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace 
dpctl::tensor::kernels::indexing diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index 279bb9f470bc..ca4a17f28f77 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -53,11 +53,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -310,6 +306,4 @@ void init_full_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(full_strided_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index 43b30fc8341c..18c15de87a40 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -43,11 +43,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -58,6 +54,4 @@ extern std::pair extern void init_full_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index ed72096bff8f..77322381d517 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -62,11 +62,7 @@ #define WRAP_MODE 0 #define CLIP_MODE 1 -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -816,6 +812,4 @@ void init_advanced_indexing_dispatch_tables(void) 
dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 5dfbd2f04d93..bc0136288e1c 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -45,11 +45,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -72,6 +68,4 @@ extern std::pair extern void init_advanced_indexing_dispatch_tables(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index f0f592c52938..13e909196460 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -54,11 +54,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -247,6 +243,4 @@ void init_triul_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index c61d95eef7ec..47cc4ce8892d 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -42,11 +42,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal 
+namespace dpctl::tensor::py_internal { extern std::pair @@ -59,6 +55,4 @@ extern std::pair extern void init_triul_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index d7370f55e8cb..b9a2e01bea4a 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -50,11 +50,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -160,6 +156,4 @@ void init_zeros_ctor_dispatch_vectors(void) return; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index ec3bce994ef6..51a1903a0f36 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -41,11 +41,7 @@ #include "dpnp4pybind11.hpp" #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -55,6 +51,4 @@ extern std::pair extern void init_zeros_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal From 3113716a13a131dc44f819140489176be5ff7cba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 02:50:47 -0800 Subject: [PATCH 34/50] Add missing include complex --- dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp | 1 + .../libtensor/include/kernels/integer_advanced_indexing.hpp | 4 +++- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 2 ++ 
.../tensor/libtensor/source/integer_advanced_indexing.cpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 2 ++ 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 47726319b3e1..22189ee3129c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -34,6 +34,7 @@ #pragma once #include +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 7351502dbc11..7be2b3ea8591 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,11 +33,13 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include -#include #include #include +#include + #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" #include "utils/offset_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index ca4a17f28f77..aef57836666e 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -41,6 +42,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 77322381d517..925cc2e895ed 100644 --- 
a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,6 +34,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index b9a2e01bea4a..2eb05e49f382 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -40,6 +41,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "utils/output_validation.hpp" From 978afee9115d8feaebe72c80ce3e827e13c66770 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 03:13:50 -0800 Subject: [PATCH 35/50] Add missing memory and queue checks --- .../libtensor/source/copy_as_contig.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 7105202fe2ff..bbee24c95d4d 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -189,6 +189,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -314,6 +320,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -459,6 +471,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + if (nelems == 0) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); @@ -624,6 +642,20 @@ std::pair throw py::value_error("Unexpected destination array layout"); } + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + int src_typenum = src.get_typenum(); int dst_typenum = dst.get_typenum(); From fec84ec7eafcfbd9a3d0e80f6a1c0e35c5312769 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 04:17:56 -0800 Subject: [PATCH 36/50] Move ti._copy_numpy_ndarray_into_usm_ndarray() --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../copy_numpy_ndarray_into_usm_ndarray.cpp | 368 ++++++++++++++++++ .../copy_numpy_ndarray_into_usm_ndarray.hpp | 57 +++ .../tensor/libtensor/source/tensor_ctors.cpp | 16 +- 4 files changed, 434 insertions(+), 9 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 53a0dfd27ba3..4e3fa580f99e 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp new file mode 100644 index 000000000000..e97e8aeb1ca1 --- /dev/null +++ 
b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -0,0 +1,368 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_blocking_fn_ptr_t; + +static copy_and_cast_from_host_blocking_fn_ptr_t + copy_and_cast_from_host_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_contig_blocking_fn_ptr_t; + +static copy_and_cast_from_host_contig_blocking_fn_ptr_t + copy_and_cast_from_host_contig_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_ndim = npy_src.ndim(); + int dst_ndim = dst.get_ndim(); + + if (src_ndim != dst_ndim) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "different array ranks, " + "i.e. 
different number of indices needed to " + "address array elements."); + } + + const py::ssize_t *src_shape = npy_src.shape(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + std::size_t src_nelems(1); + for (int i = 0; shapes_equal && (i < src_ndim); ++i) { + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + src_nelems *= static_cast(src_shape[i]); + } + + if (!shapes_equal) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "difference shapes."); + } + + if (src_nelems == 0) { + // nothing to do + return; + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // here we assume that NumPy's type numbers agree with ours for types + // supported in both + int src_typenum = + py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num; + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + py::buffer_info src_pybuf = npy_src.request(); + const char *const src_data = static_cast(src_pybuf.ptr); + char *dst_data = dst.get_data(); + + int src_flags = npy_src.flags(); + + // check for applicability of special cases: + // (same type && (both C-contiguous || both F-contiguous) + const bool both_c_contig = + ((src_flags & py::array::c_style) && dst.is_c_contiguous()); + const bool both_f_contig = + ((src_flags & py::array::f_style) && dst.is_f_contiguous()); + + const bool same_data_types = (src_type_id == dst_type_id); + + if (both_c_contig || both_f_contig) { + if (same_data_types) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev 
= + exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + copy_ev.wait(); + } + + return; + } + else { + py::gil_scoped_release lock{}; + + auto copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table + [dst_type_id][src_type_id]; + + static constexpr py::ssize_t zero_offset(0); + + copy_and_cast_from_host_contig_blocking_fn( + exec_q, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + return; + } + } + + auto const &dst_strides = + dst.get_strides_vector(); // N.B.: strides in elements + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_ndim; + const py::ssize_t *shape = src_shape; + + const py::ssize_t *src_strides_p = + npy_src.strides(); // N.B.: strides in bytes + py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes + + bool is_src_c_contig = ((src_flags & py::array::c_style) != 0); + bool is_src_f_contig = ((src_flags & py::array::f_style) != 0); + + shT src_strides_in_elems; + if (src_strides_p) { + src_strides_in_elems.resize(nd); + // copy and convert strides from bytes to elements + std::transform( + src_strides_p, src_strides_p + nd, std::begin(src_strides_in_elems), + [src_itemsize](py::ssize_t el) { + py::ssize_t q = el / src_itemsize; + if (q * src_itemsize != el) { + throw std::runtime_error( + "NumPy array strides are not multiple of itemsize"); + } + return q; + }); + } + else { + if (is_src_c_contig) { + src_strides_in_elems = + dpctl::tensor::c_contiguous_strides(nd, src_shape); + } + else if (is_src_f_contig) { + src_strides_in_elems = + dpctl::tensor::f_contiguous_strides(nd, src_shape); + } + else { + throw 
py::value_error("NumPy source array has null strides but is " + "neither C- nor F-contiguous."); + } + } + + // nd, simplified_* vectors and offsets are modified by reference + simplify_iteration_space(nd, shape, src_strides_in_elems, dst_strides, + // outputs + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + // handle nd == 0 + if (nd == 0) { + nd = 1; + simplified_shape.reserve(nd); + simplified_shape.push_back(1); + + simplified_src_strides.reserve(nd); + simplified_src_strides.push_back(1); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.push_back(1); + } + + const bool is_contig_vector = + ((nd == 1) && (simplified_src_strides.front() == 1) && + (simplified_dst_strides.front() == 1)); + + const bool can_use_memcpy = (same_data_types && is_contig_vector && + (src_offset == 0) && (dst_offset == 0)); + + if (can_use_memcpy) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = exec_q.memcpy( + static_cast(dst_data), static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + + copy_ev.wait(); + } + + return; + } + + // Minimum and maximum element offsets for source np.ndarray + py::ssize_t npy_src_min_nelem_offset(src_offset); + py::ssize_t npy_src_max_nelem_offset(src_offset); + for (int i = 0; i < nd; ++i) { + if (simplified_src_strides[i] < 0) { + npy_src_min_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + else { + npy_src_max_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + } + + if (is_contig_vector) { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + auto 
copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_contig_blocking_fn(exec_q, src_nelems, src_data, + src_offset, dst_data, + dst_offset, depends); + + return; + } + + std::vector host_task_events; + host_task_events.reserve(1); + + // Copy shape strides into device memory + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + // Get implementation function pointer + auto copy_and_cast_from_host_blocking_fn = + copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_blocking_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, + npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // invoke USM deleter in smart pointer while GIL is held + shape_strides_owner.reset(nullptr); + } + + return; +} + +void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + + DispatchTableBuilder + dtb_copy_from_numpy; + + dtb_copy_from_numpy.populate_dispatch_table( + copy_and_cast_from_host_blocking_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast:: + CopyAndCastFromHostContigFactory; + + DispatchTableBuilder + dtb_copy_from_numpy_contig; + + dtb_copy_from_numpy_contig.populate_dispatch_table( + copy_and_cast_from_host_contig_blocking_dispatch_table); +} + +} // namespace 
dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp new file mode 100644 index 000000000000..f2de95f97cca --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 182124ac4ae5..07c8fd1bf99f 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -53,7 +53,7 @@ #include "copy_as_contig.hpp" // #include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" -// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" #include "full_ctor.hpp" @@ -96,7 +96,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* 
============= Copy from numpy.ndarray to usm_ndarray ==================== */ -// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; /* ============= linear-sequence ==================== */ @@ -146,7 +146,7 @@ void init_dispatch_tables(void) using namespace dpctl::tensor::py_internal; init_copy_and_cast_usm_to_usm_dispatch_tables(); - // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; @@ -317,11 +317,11 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("include_endpoint"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_copy_numpy_ndarray_into_usm_ndarray", - // ©_numpy_ndarray_into_usm_ndarray, - // "Copy from numpy array `src` into usm_ndarray `dst` - // synchronously.", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_numpy_ndarray_into_usm_ndarray", + ©_numpy_ndarray_into_usm_ndarray, + "Copy from numpy array `src` into usm_ndarray `dst` synchronously.", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), From 497e81030c9e64b7129191d4347c7516224fe234 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 04:43:05 -0800 Subject: [PATCH 37/50] Move asnumpy(),from_numpy(), to_numpy() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 8 ++ dpctl_ext/tensor/_copy_utils.py | 202 ++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 dpctl_ext/tensor/_copy_utils.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 3c6939eff7a0..2ce61a9ab242 100644 --- a/dpctl_ext/tensor/__init__.py +++ 
b/dpctl_ext/tensor/__init__.py @@ -27,6 +27,11 @@ # ***************************************************************************** +from dpctl_ext.tensor._copy_utils import ( + asnumpy, + from_numpy, + to_numpy, +) from dpctl_ext.tensor._ctors import ( full, tril, @@ -38,9 +43,12 @@ ) __all__ = [ + "asnumpy", + "from_numpy", "full", "put", "take", + "to_numpy", "tril", "triu", ] diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py new file mode 100644 index 000000000000..9041be7686f6 --- /dev/null +++ b/dpctl_ext/tensor/_copy_utils.py @@ -0,0 +1,202 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import dpctl.memory as dpm +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._device import normalize_queue_device + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +__doc__ = ( + "Implementation module for copy- and cast- operations on " + ":class:`dpctl.tensor.usm_ndarray`." 
def _copy_to_numpy(ary):
    """Copy the content of usm_ndarray ``ary`` into a new NumPy array.

    The whole USM allocation backing ``ary`` (``ary.usm_data``) is copied
    into a USM-host buffer, and a :class:`numpy.ndarray` with the shape,
    data type, strides and offset of ``ary`` is constructed over that buffer.

    Args:
        ary (usm_ndarray):
            Array to copy to the host.
    Returns:
        :class:`numpy.ndarray`:
            Host array of the same shape and data type as ``ary``.
    Raises:
        TypeError: if ``ary`` is not a :class:`dpctl.tensor.usm_ndarray`.
    """
    if not isinstance(ary, dpt.usm_ndarray):
        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}")
    if ary.size == 0:
        # an empty array has no elements to transfer
        return np.ndarray(ary.shape, dtype=ary.dtype)

    exec_q = ary.sycl_queue
    usm_buf = ary.usm_data
    n_bytes = usm_buf.nbytes

    # host staging buffer spanning the entire underlying allocation
    host_mem = dpm.MemoryUSMHost(n_bytes, queue=exec_q)
    host_bytes = np.ndarray(n_bytes, dtype="u1", buffer=host_mem)
    host_typed = host_bytes.view(ary.dtype)

    # usm_ndarray strides and offset are counted in elements,
    # NumPy expects them in bytes
    elem_size = ary.itemsize
    byte_strides = tuple(stride * elem_size for stride in ary.strides)
    byte_offset = ary._element_offset * elem_size

    # wait for work submitted to the queue, so that the content
    # of the allocation is final before it is copied
    exec_q.wait()
    host_mem.copy_from_device(usm_buf)

    return np.ndarray(
        ary.shape,
        dtype=ary.dtype,
        buffer=host_typed,
        strides=byte_strides,
        offset=byte_offset,
    )
def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None):
    """
    from_numpy(np_ary, device=None, usm_type="device", sycl_queue=None)

    Creates :class:`dpctl.tensor.usm_ndarray` from instance of
    :class:`numpy.ndarray`.

    Args:
        np_ary:
            Input convertible to :class:`numpy.ndarray`
        device (object): array API specification of device where the
            output array is created. Device can be specified by
            a filter selector string, an instance of
            :class:`dpctl.SyclDevice`, an instance of
            :class:`dpctl.SyclQueue`, or an instance of
            :class:`dpctl.tensor.Device`. If the value is ``None``,
            returned array is created on the default-selected device.
            Default: ``None``
        usm_type (str): The requested USM allocation type for the
            output array. Recognized values are ``"device"``,
            ``"shared"``, or ``"host"``
        sycl_queue (:class:`dpctl.SyclQueue`, optional):
            A SYCL queue that determines output array allocation device
            as well as execution placement of data movement operations.
            The ``device`` and ``sycl_queue`` arguments
            are equivalent. Only one of them should be specified. If both
            are provided, they must be consistent and result in using the
            same execution queue. Default: ``None``

    Returns:
        usm_ndarray:
            A newly allocated array populated with the content of
            ``np_ary``.

    The returned array has the same shape, and the same data type kind.
    If the device does not support the data type of input array, the
    closest supported data type of the same kind is used instead, e.g.
    an input array of type ``float64`` is stored as ``float32`` (and
    ``complex128`` as ``complex64``) if the target device does not
    support double precision floating point type.
    """
    # reconcile `device` and `sycl_queue` into a single allocation queue
    alloc_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
    return _copy_from_numpy(np_ary, usm_type=usm_type, sycl_queue=alloc_queue)
+ + Args: + usm_ary (usm_ndarray): + Input array + Returns: + :class:`numpy.ndarray`: + An instance of :class:`numpy.ndarray` populated with content + of ``usm_ary`` + """ + return _copy_to_numpy(usm_ary) From 3be4e1498182891b7d59facdfe4c2a9f6ed3dc9f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 04:45:48 -0800 Subject: [PATCH 38/50] Update dpnp.asnumpy to use dpctl_ext functions --- dpnp/dpnp_array.py | 5 ++++- dpnp/dpnp_iface.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 6a2b2fd1977f..d122aff2c13f 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -41,6 +41,9 @@ import dpctl.tensor._type_utils as dtu from dpctl.tensor._numpy_helper import AxisError +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from . import memory as dpm @@ -764,7 +767,7 @@ def asnumpy(self): """ - return dpt.asnumpy(self._array_obj) + return dpt_ext.asnumpy(self._array_obj) def astype( self, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 533bdc36c617..6c050a208981 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -53,6 +53,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -136,7 +137,7 @@ def asnumpy(a, order="C"): return a.asnumpy() if isinstance(a, dpt.usm_ndarray): - return dpt.asnumpy(a) + return dpt_ext.asnumpy(a) return numpy.asarray(a, order=order) From 1d883652555cac390ba19d6fa294284690688ed1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 05:02:03 -0800 Subject: [PATCH 39/50] Move copy(), astype() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_copy_utils.py | 553 ++++++++++++++++++++++++++++++++ 2 files changed, 557 insertions(+) diff --git 
def _copy_overlapping(dst, src):
    """Copy ``src`` into ``dst`` through a temporary array.

    Assumes ``src`` and ``dst`` have the same shape. The content of ``src``
    is first copied into a freshly allocated C-contiguous device array, and
    that temporary is then copied into ``dst``, so ``dst`` is never written
    while ``src`` is still being read.
    """
    exec_q = normalize_queue_device(sycl_queue=dst.sycl_queue)
    staging = dpt.usm_ndarray(
        src.shape,
        dtype=src.dtype,
        buffer="device",
        order="C",
        buffer_ctor_kwargs={"queue": exec_q},
    )
    order_mgr = dpctl.utils.SequentialOrderManager[exec_q]

    # step 1: src -> staging, ordered after work already submitted
    ht_ev_in, copy_ev_in = ti._copy_usm_ndarray_into_usm_ndarray(
        src=src,
        dst=staging,
        sycl_queue=exec_q,
        depends=order_mgr.submitted_events,
    )
    order_mgr.add_event_pair(ht_ev_in, copy_ev_in)

    # step 2: staging -> dst, ordered after step 1 completes
    ht_ev_out, copy_ev_out = ti._copy_usm_ndarray_into_usm_ndarray(
        src=staging,
        dst=dst,
        sycl_queue=exec_q,
        depends=[copy_ev_in],
    )
    order_mgr.add_event_pair(ht_ev_out, copy_ev_out)
+ + +def _copy_same_shape(dst, src): + """Assumes src and dst have the same shape.""" + # check that memory regions do not overlap + if ti._array_overlap(dst, src): + if src._pointer == dst._pointer and ( + src is dst + or (src.strides == dst.strides and src.dtype == dst.dtype) + ): + return + _copy_overlapping(src=src, dst=dst) + return + + copy_q = dst.sycl_queue + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + + +if hasattr(np, "broadcast_shapes"): + + def _broadcast_shapes(sh1, sh2): + return np.broadcast_shapes(sh1, sh2) + +else: + + def _broadcast_shapes(sh1, sh2): + # use arrays with zero strides, whose memory footprint + # is independent of the number of array elements + return np.broadcast( + np.empty(sh1, dtype=[]), + np.empty(sh2, dtype=[]), + ).shape + + +def _broadcast_strides(X_shape, X_strides, res_ndim): + """ + Broadcasts strides to match the given dimensions; + returns tuple type strides. + """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def _copy_from_usm_ndarray_to_usm_ndarray(dst, src): + if any( + not isinstance(arg, dpt.usm_ndarray) + for arg in ( + dst, + src, + ) + ): + raise TypeError( + "Both types are expected to be dpctl.tensor.usm_ndarray, " + f"got {type(dst)} and {type(src)}." 
def _make_empty_like_orderK(x, dt, usm_type, dev):
    """
    Allocates an empty array of data type `dt` and USM type `usm_type`
    on device `dev`, with the shape of `x` and a memory layout that
    follows the stride ordering of `x`.

    Axes of `x` are ordered by decreasing absolute stride (axes of
    extent one or zero are treated as having stride zero), a C-contiguous
    array is allocated in that axis order, axes along which `x` has
    a negative stride are reversed, and the axes are permuted back.
    """
    strides = list(x.strides)
    shape = x.shape
    ndim = x.ndim

    def _stride_weight(axis):
        # axes that hold at most one element do not constrain the layout
        return builtins.abs(strides[axis]) if shape[axis] > 1 else 0

    perm = sorted(range(ndim), key=_stride_weight, reverse=True)
    inv_perm = sorted(range(ndim), key=perm.__getitem__)

    res = dpt.empty(
        tuple(shape[axis] for axis in perm),
        dtype=dt,
        usm_type=usm_type,
        device=dev,
        order="C",
    )
    if min(strides) < 0:
        reverse_axis = slice(None, None, -1)
        keep_axis = slice(None, None, None)
        # mirror the direction of traversal of `x` along each permuted axis
        res = res[
            tuple(
                reverse_axis if strides[axis] < 0 else keep_axis
                for axis in perm
            )
        ]
    return dpt.permute_dims(res, inv_perm)
def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
    """
    Returns empty array of shape `res_shape`, data type `dt` and USM type
    `usm_type` on device `dev`, with memory layout chosen from the layouts
    of the pair of arrays `X1` and `X2` (order='K' for two inputs).

    Raises:
        TypeError: if `X1` or `X2` is not a usm_ndarray.
    """
    if not isinstance(X1, dpt.usm_ndarray):
        raise TypeError(f"Expected usm_ndarray, got {type(X1)}")
    if not isinstance(X2, dpt.usm_ndarray):
        raise TypeError(f"Expected usm_ndarray, got {type(X2)}")
    nd1 = X1.ndim
    nd2 = X2.ndim
    # If one input has strictly more dimensions and already has the result
    # shape, the layout of that input alone determines the result
    if nd1 > nd2 and X1.shape == res_shape:
        return _empty_like_orderK(X1, dt, usm_type, dev)
    elif nd1 < nd2 and X2.shape == res_shape:
        return _empty_like_orderK(X2, dt, usm_type, dev)
    fl1 = X1.flags
    fl2 = X2.flags
    # C-contiguous result if either input is C-contiguous;
    # F-contiguous result only if both inputs are F-contiguous
    if fl1["C"] or fl2["C"]:
        return dpt.empty(
            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
        )
    if fl1["F"] and fl2["F"]:
        return dpt.empty(
            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
        )
    st1 = list(X1.strides)
    st2 = list(X2.strides)
    max_ndim = max(nd1, nd2)
    # Pad strides and shapes of the lower-dimensional input with zeros
    # NOTE(review): padding is appended at the end, whereas broadcasting
    # adds leading dimensions - confirm this is intended
    st1 += [0] * (max_ndim - len(st1))
    st2 += [0] * (max_ndim - len(st2))
    sh1 = list(X1.shape) + [0] * (max_ndim - nd1)
    sh2 = list(X2.shape) + [0] * (max_ndim - nd2)
    # Order axes by decreasing absolute stride, using strides of X1 as the
    # primary key and strides of X2 to break ties; axes with extent of at
    # most one are treated as having stride zero
    perm = sorted(
        range(max_ndim),
        key=lambda d: (
            builtins.abs(st1[d]) if sh1[d] > 1 else 0,
            builtins.abs(st2[d]) if sh2[d] > 1 else 0,
        ),
        reverse=True,
    )
    # inverse permutation, used to restore the original order of axes
    inv_perm = sorted(range(max_ndim), key=lambda i: perm[i])
    st1_sorted = [st1[i] for i in perm]
    st2_sorted = [st2[i] for i in perm]
    sh = res_shape
    sh_sorted = tuple(sh[i] for i in perm)
    # allocate C-contiguous array in the permuted order of axes
    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
    # entered only when each input has at least one negative stride
    if max(min(st1_sorted), min(st2_sorted)) < 0:
        # reverse axes along which both inputs have negative strides
        # NOTE(review): the generator runs over range(nd1) rather than
        # range(max_ndim), so when nd1 < max_ndim trailing permuted axes
        # are never reversed - confirm whether that is intended
        sl = tuple(
            (
                slice(None, None, -1)
                if (st1_sorted[i] < 0 and st2_sorted[i] < 0)
                else slice(None, None, None)
            )
            for i in range(nd1)
        )
        R = R[sl]
    return dpt.permute_dims(R, inv_perm)
perm] + st3_sorted = [st3[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if ( + st1_sorted[i] < 0 + and st2_sorted[i] < 0 + and st3_sorted[i] < 0 + ) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def copy(usm_ary, /, *, order="K"): + """copy(ary, order="K") + + Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`. + + Args: + ary (usm_ndarray): + Input array + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + Controls the memory layout of the output array + Returns: + usm_ndarray: + A copy of the input array. + + Memory layout of the copy is controlled by ``order`` keyword, + following NumPy's conventions. The ``order`` keywords can be + one of the following: + + .. list-table:: + + * - ``"C"`` + - C-contiguous memory layout + * - ``"F"`` + - Fortran-contiguous memory layout + * - ``"A"`` + - Fortran-contiguous if the input array is also Fortran-contiguous, + otherwise C-contiguous + * - ``"K"`` + - match the layout of ``usm_ary`` as closely as possible. + + """ + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + if not isinstance(usm_ary, dpt.usm_ndarray): + raise TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, usm_ary.dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=usm_ary.dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_same_shape(R, usm_ary) + return R + + +def astype( + usm_ary, newdtype, /, *, order="K", casting="unsafe", copy=True, device=None +): + """astype(array, new_dtype, order="K", casting="unsafe", \ + copy=True, device=None) + + Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a + specified type. + + Args: + array (usm_ndarray): + An input array. + new_dtype (dtype): + The data type of the resulting array. If `None`, gives default + floating point type supported by device where the resulting array + will be located. + order ({"C", "F", "A", "K"}, optional): + Controls memory layout of the resulting array if a copy + is returned. + casting ({'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional): + Controls what kind of data casting may occur. Please see + :meth:`numpy.ndarray.astype` for description of casting modes. + copy (bool, optional): + By default, `astype` always returns a newly allocated array. + If this keyword is set to `False`, a view of the input array + may be returned when possible. + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is `None`, + returned array is created on the same device as `array`. + Default: `None`. + + Returns: + usm_ndarray: + An array with requested data type. + + A view can be returned, if possible, when `copy=False` is used. 
+ """ + if not isinstance(usm_ary, dpt.usm_ndarray): + raise TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + ary_dtype = usm_ary.dtype + if device is not None: + if not isinstance(device, dpctl.SyclQueue): + if isinstance(device, dpt.Device): + device = device.sycl_queue + else: + device = dpt.Device.create_device(device).sycl_queue + d = device.sycl_device + target_dtype = _get_dtype(newdtype, device) + if not _dtype_supported_by_device_impl( + target_dtype, d.has_aspect_fp16, d.has_aspect_fp64 + ): + raise ValueError( + f"Requested dtype '{target_dtype}' is not supported by the " + "target device" + ) + usm_ary = usm_ary.to_device(device) + else: + target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) + + if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): + raise TypeError( + f"Can not cast from {ary_dtype} to {newdtype} " + f"according to rule {casting}." + ) + c_contig = usm_ary.flags.c_contiguous + f_contig = usm_ary.flags.f_contiguous + needs_copy = copy or not ary_dtype == target_dtype + if not needs_copy and (order != "K"): + # ensure that order="F" for C-contig input triggers copy, + # and order="C" for F-contig input triggers copy too. + # 1D arrays which are both C- and F- contig should not + # force copying for neither order="F", nor order="C", see gh-1926 + needs_copy = ( + c_contig and not f_contig and order not in ["A", "C"] + ) or (not c_contig and f_contig and order not in ["A", "F"]) + if not needs_copy: + return usm_ary + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, target_dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=target_dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary) + return R From fd18db07dc59fa927e14d39f730fce9d6c2b42ec Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 05:24:47 -0800 Subject: [PATCH 40/50] reuse astype(), copy() from dpctl_ext --- dpctl_ext/tensor/_ctors.py | 5 ++++- dpctl_ext/tensor/_indexing_functions.py | 5 ++++- dpnp/dpnp_algo/dpnp_arraycreation.py | 5 ++++- dpnp/dpnp_algo/dpnp_elementwise_common.py | 13 +++++++------ dpnp/dpnp_algo/dpnp_fill.py | 6 +++--- dpnp/dpnp_container.py | 4 +++- dpnp/dpnp_iface_arraycreation.py | 7 +++++-- dpnp/dpnp_iface_indexing.py | 15 ++++++++------- dpnp/dpnp_iface_statistics.py | 5 ++++- 9 files changed, 42 insertions(+), 23 deletions(-) diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index a0e7b28e66ff..5a39e9367e9c 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -36,6 +36,9 @@ from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti @@ -147,7 +150,7 @@ def full( usm_type=usm_type, sycl_queue=sycl_queue, ) - return dpt.copy(dpt.broadcast_to(X, shape), order=order) + return dpt_ext.copy(dpt.broadcast_to(X, shape), order=order) else: _validate_fill_value(fill_value) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 106df09cf97e..df4f3e953042 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -32,6 +32,9 @@ import dpctl.tensor as dpt import dpctl.utils 
+# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index @@ -185,7 +188,7 @@ def put_vec_duplicates(vec, ind, vals): if vals.dtype == x.dtype: rhs = vals else: - rhs = dpt.astype(vals, x.dtype) + rhs = dpt_ext.astype(vals, x.dtype) rhs = dpt.broadcast_to(rhs, val_shape) _manager = dpctl.utils.SequentialOrderManager[exec_q] diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index d94a031801f3..27f095158a85 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -33,6 +33,9 @@ import dpctl.utils as dpu import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device @@ -256,7 +259,7 @@ def dpnp_linspace( if dpnp.issubdtype(dtype, dpnp.integer): dpt.floor(usm_res, out=usm_res) - res = dpt.astype(usm_res, dtype, copy=False) + res = dpt_ext.astype(usm_res, dtype, copy=False) res = dpnp_array._create_from_usm_ndarray(res) if retstep is True: diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 88abcee5035c..55d74e8c1803 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -47,6 +47,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi @@ -212,7 +213,7 @@ def __call__( x_usm = dpnp.get_usm_ndarray(x) if dtype is not None: - x_usm = dpt.astype(x_usm, dtype, copy=False) + x_usm = dpt_ext.astype(x_usm, dtype, copy=False) out = 
self._unpack_out_kw(out) out_usm = None if out is None else dpnp.get_usm_ndarray(out) @@ -718,9 +719,9 @@ def __call__( sycl_queue=x2.sycl_queue, usm_type=x2.usm_type, ) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) elif dpnp.isscalar(x2): - x1_usm = dpt.astype(x1_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) x2_usm = dpt.asarray( x2, dtype=dtype, @@ -728,8 +729,8 @@ def __call__( usm_type=x1.usm_type, ) else: - x1_usm = dpt.astype(x1_usm, dtype, copy=False) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) res_usm = super().__call__(x1_usm, x2_usm, out=out_usm, order=order) @@ -1325,7 +1326,7 @@ def __call__(self, x, /, decimals=0, out=None, *, dtype=None): res_usm = dpt.divide(x_usm, 10**decimals, out=out_usm) if dtype is not None: - res_usm = dpt.astype(res_usm, dtype, copy=False) + res_usm = dpt_ext.astype(res_usm, dtype, copy=False) if out is not None and isinstance(out, dpnp_array): return out diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 4137a2794747..ddba9f634cb1 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,10 +32,10 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -import dpnp - # TODO: revert to `from dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, @@ -56,7 +56,7 @@ def dpnp_fill(arr, val): raise dpu.ExecutionPlacementError( "Input arrays have incompatible queues." 
) - a_val = dpt.astype(val, arr.dtype) + a_val = dpt_ext.astype(val, arr.dtype) a_val = dpt.broadcast_to(a_val, arr.shape) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index c8e28529cd57..acda579a5f5e 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,8 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -141,7 +143,7 @@ def copy(x1, /, *, order="K"): if order is None: order = "K" - array_obj = dpt.copy(dpnp.get_usm_ndarray(x1), order=order) + array_obj = dpt_ext.copy(dpnp.get_usm_ndarray(x1), order=order) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 8d4ebdd1a6c2..404d9e0fe37d 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -46,6 +46,9 @@ import dpctl.tensor as dpt import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp import dpnp_container @@ -934,7 +937,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None): order = "K" usm_x = dpnp.get_usm_ndarray(x) - usm_res = dpt.astype( + usm_res = dpt_ext.astype( usm_x, dtype, order=order, casting=casting, copy=copy, device=device ) @@ -3129,7 +3132,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): output = dpt.broadcast_arrays(*output) if copy: - output = [dpt.copy(x) for x in output] + output = [dpt_ext.copy(x) for x in output] return [dpnp_array._create_from_usm_ndarray(x) for x in output] diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a01a036e16cc..a77877d8d685 100644 --- a/dpnp/dpnp_iface_indexing.py +++ 
b/dpnp/dpnp_iface_indexing.py @@ -51,11 +51,10 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index -import dpctl_ext.tensor as dpt_ext - # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -243,7 +242,7 @@ def choose(a, choices, out=None, mode="wrap"): # NumPy will cast up to int64 in general but # int32 is more than safe for bool if ind_dt == dpnp.bool: - inds = dpt.astype(inds, dpt.int32) + inds = dpt_ext.astype(inds, dpt.int32) else: raise TypeError("input index array must be of integer data type") @@ -256,7 +255,7 @@ def choose(a, choices, out=None, mode="wrap"): choices = tuple( map( lambda chc: ( - chc if chc.dtype == res_dt else dpt.astype(chc, res_dt) + chc if chc.dtype == res_dt else dpt_ext.astype(chc, res_dt) ), choices, ) @@ -1616,7 +1615,9 @@ def place(a, mask, vals): if usm_vals.dtype != usm_a.dtype: # dpt.place casts values to a.dtype with "unsafe" rule, # while numpy.place does that with "safe" casting rule - usm_vals = dpt.astype(usm_vals, usm_a.dtype, casting="safe", copy=False) + usm_vals = dpt_ext.astype( + usm_vals, usm_a.dtype, casting="safe", copy=False + ) dpt.place(usm_a, usm_mask, usm_vals) @@ -1712,7 +1713,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.put supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, casting="safe") + usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, casting="safe") in_usm_a = usm_a if axis is None and usm_a.ndim > 1: @@ -2171,7 +2172,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.take supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, copy=False, casting="safe") + 
usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, copy=False, casting="safe") usm_res = _take_index( usm_a, usm_ind, axis, exec_q, res_usm_type, out=out, mode=mode diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index 7e092184366c..daff981d5cc4 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -47,6 +47,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -1204,7 +1207,7 @@ def mean(a, /, axis=None, dtype=None, out=None, keepdims=False, *, where=True): usm_a = dpnp.get_usm_ndarray(a) usm_res = dpt.mean(usm_a, axis=axis, keepdims=keepdims) if dtype is not None: - usm_res = dpt.astype(usm_res, dtype) + usm_res = dpt_ext.astype(usm_res, dtype) return dpnp.get_result_array(usm_res, out, casting="unsafe") From 09761710153cee6060ffe27461af6444d3e4699b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:00:04 -0800 Subject: [PATCH 41/50] Move _copy_usm_ndarray_for_reshape --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../libtensor/source/copy_for_reshape.cpp | 184 ++++++++++++++++++ .../libtensor/source/copy_for_reshape.hpp | 54 +++++ .../tensor/libtensor/source/tensor_ctors.cpp | 17 +- 4 files changed, 248 insertions(+), 9 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 4e3fa580f99e..b30a1ac8b029 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -49,7 +49,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp new file mode 100644 index 000000000000..524bfcfdb98b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_reshape.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_reshape_fn_ptr_t + copy_for_reshape_generic_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i in range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t src_nelems = src.get_size(); + py::ssize_t dst_nelems = dst.get_size(); + + // Must have the same number of elements + if (src_nelems != dst_nelems) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same number of elements."); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + // dimensions may be different + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + auto fn = copy_for_reshape_generic_dispatch_vector[type_id]; + + auto src_shape = src.get_shape_vector(); + 
auto src_strides = src.get_strides_vector(); + + auto dst_shape = dst.get_shape_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_shape, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, src_shape, src_strides, dst_shape, + dst_strides); + auto copy_shape_ev = std::get<2>(ptr_size_event_tuple); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_reshape_event = + fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, + dst_data, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_reshape_event}, shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_reshape_event); +} + +void init_copy_for_reshape_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp new file mode 100644 index 000000000000..c5af885ad6cd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp @@ -0,0 +1,54 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_reshape_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 07c8fd1bf99f..4903f22bd481 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -51,7 +51,7 @@ // #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" -// #include "copy_for_reshape.hpp" +#include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" @@ -87,7 +87,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* =========================== Copy for reshape ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* =========================== Copy for roll ============================= */ @@ -279,12 +279,13 @@ PYBIND11_MODULE(_tensor_impl, m) }, ""); - // m.def("_copy_usm_ndarray_for_reshape", &copy_usm_ndarray_for_reshape, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "number of elements using underlying 'C'-contiguous order for - // flat " "traversal. 
" "Returns a tuple of events: (ht_event, - // comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_reshape", &copy_usm_ndarray_for_reshape, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "number of elements using underlying 'C'-contiguous order for flat " + "traversal. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d, // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same From 318692ef0f33820ddf5850ddf5a9a0a030b139a9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:26:33 -0800 Subject: [PATCH 42/50] Move reshape() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_reshape.py | 206 +++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 dpctl_ext/tensor/_reshape.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a02cf85ed591..416216074f8a 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -43,6 +43,7 @@ put, take, ) +from dpctl_ext.tensor._reshape import reshape __all__ = [ "asnumpy", @@ -51,6 +52,7 @@ "from_numpy", "full", "put", + "reshape", "take", "to_numpy", "tril", diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py new file mode 100644 index 000000000000..6afa1dc245c3 --- /dev/null +++ b/dpctl_ext/tensor/_reshape.py @@ -0,0 +1,206 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._tensor_impl import ( + _copy_usm_ndarray_for_reshape, + _ravel_multi_index, + _unravel_index, +) + +__doc__ = "Implementation module for :func:`dpctl.tensor.reshape`." 
+ + +def _make_unit_indexes(shape): + """ + Construct a diagonal matrix with with one on the diagonal + except if the corresponding element of shape is 1. + """ + nd = len(shape) + mi = np.zeros((nd, nd), dtype="u4") + for i, dim in enumerate(shape): + mi[i, i] = 1 if dim > 1 else 0 + return mi + + +def ti_unravel_index(flat_index, shape, order="C"): + return _unravel_index(flat_index, shape, order) + + +def ti_ravel_multi_index(multi_index, shape, order="C"): + return _ravel_multi_index(multi_index, shape, order) + + +def reshaped_strides(old_sh, old_sts, new_sh, order="C"): + """ + When reshaping array with `old_sh` shape and `old_sts` strides + into the new shape `new_sh`, returns the new stride if the reshape + can be a view, otherwise returns `None`. + """ + eye_new_mi = _make_unit_indexes(new_sh) + new_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + old_sts, ti_unravel_index(flat_index, old_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, new_sh, order=order) + for unitvec in eye_new_mi + ] + ] + eye_old_mi = _make_unit_indexes(old_sh) + check_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + new_sts, ti_unravel_index(flat_index, new_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, old_sh, order=order) + for unitvec in eye_old_mi + ] + ] + valid = all( + check_st == old_st or old_dim == 1 + for check_st, old_st, old_dim in zip(check_sts, old_sts, old_sh) + ) + return new_sts if valid else None + + +def reshape(X, /, shape, *, order="C", copy=None): + """reshape(x, shape, order="C") + + Reshapes array ``x`` into new shape. + + Args: + x (usm_ndarray): + input array + shape (Tuple[int]): + the desired shape of the resulting array. + order ("C", "F", optional): + memory layout of the resulting array + if a copy is found to be necessary. Supported + choices are ``"C"`` for C-contiguous, or row-major layout; + and ``"F"`` for F-contiguous, or column-major layout. 
+ + Returns: + usm_ndarray: + Reshaped array is a view, if possible, + and a copy otherwise with memory layout as indicated + by ``order`` keyword. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError + if not isinstance(shape, (list, tuple)): + shape = (shape,) + if order in "cfCF": + order = order.upper() + else: + raise ValueError( + f"Keyword 'order' not recognized. Expecting 'C' or 'F', got {order}" + ) + if copy not in (True, False, None): + raise ValueError( + f"Keyword 'copy' not recognized. Expecting True, False, " + f"or None, got {copy}" + ) + shape = [operator.index(d) for d in shape] + negative_ones_count = 0 + for nshi in shape: + if nshi == -1: + negative_ones_count = negative_ones_count + 1 + if (nshi < -1) or negative_ones_count > 1: + raise ValueError( + "Target shape should have at most 1 negative " + "value which can only be -1" + ) + if negative_ones_count: + sz = -np.prod(shape) + if sz == 0: + raise ValueError( + f"Can not reshape array of size {X.size} into " + f"shape {tuple(i for i in shape if i >= 0)}" + ) + v = X.size // sz + shape = [v if d == -1 else d for d in shape] + if X.size != np.prod(shape): + raise ValueError(f"Can not reshape into {shape}") + if X.size: + newsts = reshaped_strides(X.shape, X.strides, shape, order=order) + else: + newsts = (1,) * len(shape) + copy_required = newsts is None + if copy_required and (copy is False): + raise ValueError( + "Reshaping the array requires a copy, but no copying was " + "requested by using copy=False" + ) + copy_q = X.sycl_queue + if copy_required or (copy is True): + # must perform a copy + copy_q = X.sycl_queue + flat_res = dpt.usm_ndarray( + (X.size,), + dtype=X.dtype, + buffer=X.usm_type, + buffer_ctor_kwargs={"queue": copy_q}, + ) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + if order == "C": + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + else: + X_t = 
dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, r_e) + return dpt.usm_ndarray( + tuple(shape), dtype=X.dtype, buffer=flat_res, order=order + ) + # can form a view + if (len(shape) == X.ndim) and all( + s1 == s2 for s1, s2 in zip(shape, X.shape) + ): + return X + return dpt.usm_ndarray( + shape, + dtype=X.dtype, + buffer=X, + strides=tuple(newsts), + offset=X._element_offset, + ) From 3c0c1132df85fc9ae76e6c953392ba674e0dd4f3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:27:16 -0800 Subject: [PATCH 43/50] Reuse reshape from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_arraycreation.py | 4 +++- dpnp/dpnp_iface_arraycreation.py | 8 ++++---- dpnp/dpnp_iface_indexing.py | 24 ++++++++++++------------ dpnp/dpnp_iface_manipulation.py | 9 ++++++--- dpnp/dpnp_iface_sorting.py | 5 ++++- dpnp/tests/test_arraycreation.py | 5 ++++- 6 files changed, 33 insertions(+), 22 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 27f095158a85..47edf63a68b4 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -233,7 +233,9 @@ def dpnp_linspace( usm_type=_usm_type, sycl_queue=sycl_queue_normalized, ) - usm_res = dpt.reshape(usm_res, (-1,) + (1,) * delta.ndim, copy=False) + usm_res = dpt_ext.reshape( + usm_res, (-1,) + (1,) * delta.ndim, copy=False + ) if step_num > 0: step = delta / step_num diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 404d9e0fe37d..c2dd5793f827 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3117,7 +3117,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): s0 = (1,) * ndim output = [ - dpt.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :]) + dpt_ext.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 
1 :]) for i, x in enumerate(xi) ] @@ -3125,8 +3125,8 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): _, _ = get_usm_allocations(output) if indexing == "xy" and ndim > 1: - output[0] = dpt.reshape(output[0], (1, -1) + s0[2:]) - output[1] = dpt.reshape(output[1], (-1, 1) + s0[2:]) + output[0] = dpt_ext.reshape(output[0], (1, -1) + s0[2:]) + output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:]) if not sparse: output = dpt.broadcast_arrays(*output) @@ -3932,7 +3932,7 @@ def vander( tmp = m[:, ::-1] if not increasing else m dpnp.power( - dpt.reshape(usm_x, (-1, 1)), + dpt_ext.reshape(usm_x, (-1, 1)), dpt.arange( N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue ), diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a77877d8d685..439ec288ebe2 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -814,14 +814,14 @@ def extract(condition, a): ) if usm_cond.size != usm_a.size: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt.extract(usm_cond, usm_a) @@ -958,18 +958,18 @@ def fill_diagonal(a, val, wrap=False): # a.flat[:end:step] = val # but need to consider use case when `a` is usm_ndarray also a_sh = a.shape - tmp_a = dpt.reshape(usm_a, -1) + tmp_a = dpt_ext.reshape(usm_a, -1) if dpnp.isscalar(usm_val): tmp_a[:end:step] = usm_val else: - usm_val = dpt.reshape(usm_val, -1) + usm_val = dpt_ext.reshape(usm_val, -1) # Setitem can work only if index size equal val size. # Using loop for general case without dependencies of val size. 
for i in range(0, usm_val.size): tmp_a[step * i : end : step * (i + 1)] = usm_val[i] - tmp_a = dpt.reshape(tmp_a, a_sh) + tmp_a = dpt_ext.reshape(tmp_a, a_sh) usm_a[:] = tmp_a @@ -1610,7 +1610,7 @@ def place(a, mask, vals): if usm_vals.ndim != 1: # dpt.place supports only 1-D array of values - usm_vals = dpt.reshape(usm_vals, -1) + usm_vals = dpt_ext.reshape(usm_vals, -1) if usm_vals.dtype != usm_a.dtype: # dpt.place casts values to a.dtype with "unsafe" rule, @@ -1709,7 +1709,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if usm_ind.ndim != 1: # dpt.put supports only 1-D array of indices - usm_ind = dpt.reshape(usm_ind, -1, copy=False) + usm_ind = dpt_ext.reshape(usm_ind, -1, copy=False) if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.put supports only integer dtype for array of indices @@ -1717,11 +1717,11 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): in_usm_a = usm_a if axis is None and usm_a.ndim > 1: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access - in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) + in_usm_a[:] = dpt_ext.reshape(usm_a, in_usm_a.shape, copy=False) def put_along_axis(a, ind, values, axis, mode="wrap"): @@ -2163,7 +2163,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if axis is None: if a_ndim > 1: # flatten input array - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = 0 elif a_ndim == 0: axis = normalize_axis_index(operator.index(axis), 1) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 9df5278bd16b..f992cc366e20 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -53,6 +53,9 @@ normalize_axis_tuple, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from 
.dpnp_array import dpnp_array @@ -415,7 +418,7 @@ def _get_first_nan_index(usm_a): dpt.place( usm_res.inverse_indices, usm_res.inverse_indices > first_nan, - dpt.reshape(first_nan, 1), + dpt_ext.reshape(first_nan, 1), ) result += (usm_res.inverse_indices,) @@ -3057,7 +3060,7 @@ def reshape(a, /, shape, order="C", *, copy=None): ) usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.reshape(usm_a, shape=shape, order=order, copy=copy) + usm_res = dpt_ext.reshape(usm_a, shape=shape, order=order, copy=copy) return dpnp_array._create_from_usm_ndarray(usm_res) @@ -3259,7 +3262,7 @@ def roll(x, shift, axis=None): shift = dpnp.asnumpy(shift) if axis is None: - return roll(dpt.reshape(usm_x, -1), shift, 0).reshape(x.shape) + return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape) usm_res = dpt.roll(usm_x, shift=shift, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index db33a88c7488..3e5cdd4da6a6 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -43,6 +43,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -84,7 +87,7 @@ def _wrap_sort_argsort( usm_a = dpnp.get_usm_ndarray(a) if axis is None: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = -1 axis = normalize_axis_index(axis, ndim=usm_a.ndim) diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index eb20f9b3ffe5..423004470bad 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -13,6 +13,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -969,7 +972,7 @@ def test_ones_like(array, dtype, 
order): ], ) def test_dpctl_tensor_input(func, args): - x0 = dpt.reshape(dpt.arange(9), (3, 3)) + x0 = dpt_ext.reshape(dpt.arange(9), (3, 3)) new_args = [eval(val, {"x0": x0}) for val in args] X = getattr(dpt, func)(*new_args) Y = getattr(dpnp, func)(*new_args) From 30f2c53ee0d6dcddfbc8d0933a667705b048e8b1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:44:48 -0800 Subject: [PATCH 44/50] Move _copy_usm_ndarray_for_roll --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../tensor/libtensor/source/copy_for_roll.cpp | 400 ++++++++++++++++++ .../tensor/libtensor/source/copy_for_roll.hpp | 65 +++ .../tensor/libtensor/source/tensor_ctors.cpp | 38 +- 4 files changed, 485 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index b30a1ac8b029..a39368c7baa3 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -50,7 +50,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 000000000000..a187b2247677 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,400 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_roll.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, 
{copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + std::size_t offset = + (shift > 0) ? (shift % src_nelems) : src_nelems + (shift % src_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (both_c_contig || both_f_contig) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + static constexpr py::ssize_t zero_offset = 0; + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape_ptr; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + + sycl::event copy_for_roll_ev = + 
fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data, + dst_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto fn = copy_for_roll_strided_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, + src_offset, dst_data, dst_offset, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same number of dimensions."); + } + + if (static_cast(src_nd) != shifts.size()) { + throw 
py::value_error( + "copy_usm_ndarray_for_roll_nd requires shifts to " + "contain an integral shift for each array dimension."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check for compatible queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + std::vector normalized_shifts{}; + normalized_shifts.reserve(src_nd); + + for (int i = 0; i < src_nd; ++i) { + // normalize shift parameter to be 0 <= offset < dim + py::ssize_t dim = src_shape_ptr[i]; + std::size_t offset = + (shifts[i] >= 0) ? 
(shifts[i] % dim) : dim + (shifts[i] % dim); + + normalized_shifts.push_back(offset); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + auto const &common_shape = src.get_shape_vector(); + + static constexpr py::ssize_t src_offset = 0; + static constexpr py::ssize_t dst_offset = 0; + + auto fn = copy_for_roll_ndshift_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, common_shape, src_strides, dst_strides, + normalized_shifts); + auto shape_strides_shifts_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_shifts_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +void init_copy_for_roll_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory; 
+ DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory; + DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp new file mode 100644 index 000000000000..cffbf9f6f0d6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp @@ -0,0 +1,65 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 4903f22bd481..c1372c1c2406 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" #include "copy_for_reshape.hpp" -// #include "copy_for_roll.hpp" 
+#include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" @@ -91,8 +91,8 @@ using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* =========================== Copy for roll ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ @@ -158,8 +158,8 @@ void init_dispatch_vectors(void) using namespace dpctl::tensor::py_internal; init_copy_as_contig_dispatch_vectors(); - // init_copy_for_reshape_dispatch_vectors(); - // init_copy_for_roll_dispatch_vectors(); + init_copy_for_reshape_dispatch_vectors(); + init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); @@ -287,21 +287,21 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for flat " - // "traversal with shift. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("src"), py::arg("dst"), py::arg("shift"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for flat " + "traversal with shift. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shift"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_nd", &copy_usm_ndarray_for_roll_nd, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for " "traversal - // with shifts along each axis. " "Returns a tuple of events: - // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = - // py::list()); + m.def("_copy_usm_ndarray_for_roll_nd", &copy_usm_ndarray_for_roll_nd, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for " + "traversal with shifts along each axis. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shifts"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, // "Fills input 1D contiguous usm_ndarray `dst` with linear From 85c29daa5b6855db9d3b023e29777950aebaba18 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:57:28 -0800 Subject: [PATCH 45/50] Move roll() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_manipulation_functions.py | 120 ++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 dpctl_ext/tensor/_manipulation_functions.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 416216074f8a..edb2c096bad1 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -43,6 +43,9 @@ put, take, ) +from dpctl_ext.tensor._manipulation_functions import ( + roll, +) from dpctl_ext.tensor._reshape import reshape __all__ = [ @@ -53,6 +56,7 @@ "full", "put", "reshape", + "roll", "take", "to_numpy", "tril", diff --git
a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py new file mode 100644 index 000000000000..fa8fc27876b3 --- /dev/null +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -0,0 +1,120 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils as dputils +import numpy as np + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_tuple + +__doc__ = ( + "Implementation module for array manipulation " + "functions in :module:`dpctl.tensor`" +) + + +def roll(x, /, shift, *, axis=None): + """ + roll(x, shift, axis) + + Rolls array elements along a specified axis. + Array elements that roll beyond the last position are re-introduced + at the first position. Array elements that roll beyond the first position + are re-introduced at the last position. + + Args: + x (usm_ndarray): input array + shift (Union[int, Tuple[int,...]]): number of places by which the + elements are shifted. If `shift` is a tuple, then `axis` must be a + tuple of the same size, and each of the given axes must be shifted + by the corresponding element in `shift`. If `shift` is an `int` + and `axis` a tuple, then the same `shift` must be used for all + specified axes. If a `shift` is positive, then array elements are + shifted positively (toward larger indices) along the dimension of + `axis`. + If a `shift` is negative, then array elements must be shifted + negatively (toward smaller indices) along the dimension of `axis`. + axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along which + to shift elements. If `axis` is `None`, the array is + flattened, shifted, and then restored to its original shape. + Default: `None`. + + Returns: + usm_ndarray: + An array having the same `dtype`, `usm_type` and + `device` attributes as `x` and whose elements are shifted relative + to `x`.
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + exec_q = x.sycl_queue + _manager = dputils.SequentialOrderManager[exec_q] + if axis is None: + shift = operator.index(shift) + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + sz = operator.index(x.size) + shift = (shift % sz) if sz > 0 else 0 + dep_evs = _manager.submitted_events + hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d( + src=x, + dst=res, + shift=shift, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, roll_ev) + return res + axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True) + broadcasted = np.broadcast(shift, axis) + if broadcasted.ndim > 1: + raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") + shifts = [ + 0, + ] * x.ndim + shape = x.shape + for sh, ax in broadcasted: + n_i = operator.index(shape[ax]) + shifted = shifts[ax] + operator.index(sh) + shifts[ax] = (shifted % n_i) if n_i > 0 else 0 + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + dep_evs = _manager.submitted_events + ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd( + src=x, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e, roll_ev) + return res From 6e8d857a5e85894a11c16a728ec437fac629a196 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:58:11 -0800 Subject: [PATCH 46/50] Update dpnp.roll to use dpctl_ext --- dpnp/dpnp_iface_manipulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index f992cc366e20..edd98348afe3 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3264,7 +3264,7 @@ def roll(x, shift, axis=None): if axis is None: return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape) - usm_res = dpt.roll(usm_x, shift=shift, 
axis=axis) + usm_res = dpt_ext.roll(usm_x, shift=shift, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) From 19e93b99c7c2c238f1b697dfefe5b70525370819 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:34:09 -0800 Subject: [PATCH 47/50] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..0cfebe53f623 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,7 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +# TODO: revert to `dpctl/` +# when dpnp fully migrates dpctl/tensor +dpctl_ext/**/*.cpython*.so From b111e49b784168180c835569d5dbe97958521f16 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:35:23 -0800 Subject: [PATCH 48/50] Remove unused includes in tensor_ctors.cpp --- dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index be69ee1a8c7e..54d6adbc8f6e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,18 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -// #include -// #include -// #include -#include -#include -#include -// #include #include #include #include #include +#include +#include +#include + #include "dpnp4pybind11.hpp" // #include "accumulators.hpp" From c082224e07df5e4d4960112ef5ec4e5faef2a452 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 05:40:59 -0800 Subject: [PATCH 49/50] Use Python::Module for dpctl_ext static lib to avoid libpython dependency --- dpctl_ext/tensor/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 28e7a4cb55f4..ed69b4f10cba 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,7 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -find_package(Python COMPONENTS Development) +find_package(Python COMPONENTS Development.Module) if(WIN32) if(${CMAKE_VERSION} VERSION_LESS "3.27") @@ -74,7 +74,7 @@ target_include_directories( # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Module) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts) From d6da9132db7c98901330b978516ba6229d5d1eb7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 24 Feb 2026 15:36:39 -0800 Subject: [PATCH 50/50] Use import from dpctl_ext in _reshape.py --- dpctl_ext/tensor/_reshape.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py index 6afa1dc245c3..61aa6c9c754f 100644 --- a/dpctl_ext/tensor/_reshape.py +++ b/dpctl_ext/tensor/_reshape.py @@ -31,7 +31,10 @@ import dpctl.tensor as dpt import dpctl.utils import numpy as np -from dpctl.tensor._tensor_impl import ( + +# TODO: revert to `from dpctl.tensor._tensor_impl...` +# when dpnp fully migrates dpctl/tensor +from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_for_reshape, _ravel_multi_index, _unravel_index,